Compile Scan Lists

from scan_list_comp.Rmd Compile lists of all GluCEST scans.

To DO: - confirm that newer scan dates = Terra - pull scan IDs

Load & Clean Lists

7T list from David (up to ~ summer 2019)

glu_old <- read.csv("data/7t_dates.csv", na.strings = "") # read in 7T list from David (up to ~ summer 2019)
summary(glu_old)
##      BBLID          X7T_date         ONM_7T_date          ONM_SCANID   
##  Min.   : 12882   Length:134         Length:134         Min.   : 8559  
##  1st Qu.: 19792   Class :character   Class :character   1st Qu.: 8928  
##  Median : 86344   Mode  :character   Mode  :character   Median : 9048  
##  Mean   : 64860                                         Mean   : 9159  
##  3rd Qu.: 98273                                         3rd Qu.: 9334  
##  Max.   :139272                                         Max.   :10091  
##                                                         NA's   :68     
##  Terra_SCANID      
##  Length:134        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# clean up formatting
glu_old <- glu_old %>%
  mutate(BBLID=as.character(BBLID),
  X7T_date = gsub(",.*","", X7T_date), 
  Terra.7T_date = as.Date(X7T_date,format ="%m/%d/%y"),
  ONM.7T_date = as.Date(ONM_7T_date,format ="%m/%d/%y"),
  ONM.SCANID=as.character(ONM_SCANID)) %>%
  rename(Terra.SCANID=Terra_SCANID) %>%
 dplyr::select(-X7T_date, -ONM_7T_date, -ONM_SCANID)

# make long
glu_old.long <- glu_old %>% pivot_longer(2:5, names_sep = '[.]', names_to=c("scanner", ".value")) %>%
  drop_na("7T_date")

7T list from Arianna’s LongGluCEST study (pulled April 2022)

glu_ar <- read.csv("data/longglucest_scans.csv", header=T) # read in 7T list from Arianna LongGluCEST study (pulled April 2022)
summary(glu_ar)
##      BBLID        baseline_clinical    base_7t           follow_7t        
##  Min.   : 20082   Length:35          Length:35          Length:35         
##  1st Qu.: 90350   Class :character   Class :character   Class :character  
##  Median : 96465   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 92511                                                           
##  3rd Qu.:116068                                                           
##  Max.   :135085
#clean up formatting
glu_ar <- glu_ar %>%
  mutate(BBLID=as.character(BBLID),
         base_7t = as.Date(base_7t,format ="%m/%d/%y"),
         follow_7t = as.Date(follow_7t,format ="%m/%d/%y")) %>%
 dplyr::select(-baseline_clinical)

# make long
glu_ar.long <- gather(glu_ar, scanner, "7T_date", base_7t:follow_7t, factor_key=TRUE) %>%
  drop_na("7T_date") %>%
  mutate(scanner= fct_collapse(scanner, Terra = c("base_7t", "follow_7t")))

7T list from Heather’s Aging study

glu_age <- read.csv("data/7tglucestage_scans.csv", header=T) # read in 7T list from Heather Aging study (pulled April 2022)

#clean up
glu_age <- glu_age %>%
  transmute(BBLID=as.character(BBLID),
         "7T_date" = as.Date(DOSCAN,format ="%m/%d/%y"),
         scanner = as.factor("Terra"),
         SCANID=as.character(SCANID))
summary(glu_age)
##     BBLID              7T_date            scanner      SCANID         
##  Length:50          Min.   :2021-04-15   Terra:50   Length:50         
##  Class :character   1st Qu.:2021-07-14              Class :character  
##  Mode  :character   Median :2021-10-27              Mode  :character  
##                     Mean   :2021-10-10                                
##                     3rd Qu.:2021-12-12                                
##                     Max.   :2022-04-01

Join all lists:

#join long versions of old and Longitudinal scan lists
long1 <- merge(x=glu_old.long, y=glu_ar.long, by=c("BBLID", "7T_date", "scanner"),all=TRUE)
str(long1)
## 'data.frame':    182 obs. of  4 variables:
##  $ BBLID  : chr  "100522" "102041" "102041" "105176" ...
##  $ 7T_date: Date, format: "2021-12-11" "2018-06-15" ...
##  $ scanner: chr  "Terra" "Terra" "Terra" "ONM" ...
##  $ SCANID : chr  NA "10821" NA "9332" ...
#join in Age study scan list
all_7t.long <- merge(x=long1, y=glu_age, by=c("BBLID", "7T_date", "scanner", "SCANID"), all=TRUE)
all_7t.long <- all_7t.long %>% rename(date_7t = "7T_date")
str(all_7t.long)
## 'data.frame':    232 obs. of  4 variables:
##  $ BBLID  : chr  "100522" "100898" "102041" "102041" ...
##  $ date_7t: Date, format: "2021-12-11" "2021-11-30" ...
##  $ scanner: chr  "Terra" "Terra" "Terra" "Terra" ...
##  $ SCANID : chr  NA "11977" "10821" NA ...
dup <- duplicated(all_7t.long[,1:2])
any(dup) #no duplicates!
## [1] FALSE

Next make a wide version to easily add in clinical/subject-level data.

all_7t.count <- all_7t.long %>%
  group_by(BBLID) %>%
  arrange(all_7t.long$date_7t) %>%
  mutate(count = as.character(row_number(BBLID)),
         scanner=as.factor(scanner)) %>%
  ungroup() # add counts
all_7t.wide <- pivot_wider(all_7t.count, id_cols=BBLID, names_from = count, values_from=c(date_7t, scanner, SCANID)) # pivot
colnames(all_7t.wide)[2:ncol(all_7t.wide)] <- paste("scan", colnames(all_7t.wide[,c(2:ncol(all_7t.wide))]), sep = "_") #rename cols

# #write final list to csv
# write.csv(all_7t.wide, "data/all_7T_april22.csv", row.names=FALSE, na="")
# save(all_7t.wide, file="data/all_7T_april22.Rdata")

n_distinct(all_7t.wide$BBLID)
## [1] 199

Load diagnosis data from oracle ID and remove 22q subj

#load diagnosis data
redcap <- read.csv("data/oracle_diagnoses_all.csv", header = TRUE, stringsAsFactors = F, na = c("", "null", "..", ".")) %>% 
 dplyr::select(BBLID, DODIAGNOSIS, HSTATUS, AGEONSET_CLINICRISK | contains("AXIS1_")) %>% #get manageable subset of data
  mutate(DODIAGNOSIS = as.Date(DODIAGNOSIS),
         BBLID=as.factor(BBLID))


#make list of 22q subj
q22 <- subset(redcap, HSTATUS == "22q",select = BBLID) %>% unique()

#remove 22q participants from scan list, that way it'll apply to all subsequent merges w/ clinical data
all_7t.wide <- all_7t.wide %>% subset(!(BBLID %in% q22$BBLID)) %>% as.data.frame()#drop 22q sub
all_7t.long <- all_7t.long %>% subset(!(BBLID %in% q22$BBLID)) %>% as.data.frame()#drop 22q sub

There are 197 (197) unique subjects who have 7T scans, excluding 22q pts ID’d from Kosha’s RedCap data.

SIPS/SOPS

Load SIPS data

#load SIPS
sips <- fread("data/oracle_sips_all.csv", na.strings="null", header=TRUE)
  #mutate(DOSIPS = as.Date(DOSIPS),
         #bblid = as.factor(bblid))
sips$BBLID <- as.factor(sips$BBLID)
skim(sips)
Data summary
Name sips
Number of rows 4289
Number of columns 42
Key NULL
_______________________
Column type frequency:
character 11
Date 1
factor 1
numeric 29
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
PROTOCOL 7 1.00 12 25 0 28 0
TYPE 0 1.00 1 10 0 20 0
RATER 0 1.00 2 8 0 60 0
SIPS_SOURCEID 4054 0.05 7 18 0 235 0
SOURCE_PROJECT 4054 0.05 3 10 0 6 0
SPD 659 0.85 1 1 0 2 0
FHPI 964 0.78 1 1 0 2 0
FIRST_DEG_SCZ 1623 0.62 1 1 0 2 0
PRODROMAL 1254 0.71 1 1 0 2 0
CONSIDER 1254 0.71 1 1 0 2 0
REVIEW 1254 0.71 1 1 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DOSIPS 0 1 1970-01-01 2022-04-07 2015-01-21 1919

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
BBLID 0 1 FALSE 1789 109: 19, 919: 14, 196: 13, 117: 13

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
VISITNUM 158 0.96 1.04 0.27 1.0 1.0 1.0 1.0 4.0 ▇▁▁▁▁
P1 6 1.00 1.26 1.74 0.0 0.0 0.0 2.0 9.0 ▇▃▁▁▁
P2 9 1.00 1.09 1.60 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
P3 8 1.00 0.62 1.30 0.0 0.0 0.0 1.0 9.0 ▇▂▁▁▁
P4 11 1.00 1.10 1.74 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
P5 13 1.00 0.88 1.41 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
N1 15 1.00 1.19 1.64 0.0 0.0 1.0 2.0 9.0 ▇▂▁▁▁
N2 14 1.00 1.02 1.57 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
N3 19 1.00 0.88 1.57 0.0 0.0 0.0 1.0 9.0 ▇▂▁▁▁
N4 18 1.00 0.51 1.48 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
N5 16 1.00 1.20 1.69 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
N6 19 1.00 1.30 1.96 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
D1 16 1.00 0.59 1.42 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
D2 17 1.00 0.57 1.47 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
D3 18 1.00 1.38 1.65 0.0 0.0 1.0 2.0 9.0 ▇▅▁▁▁
D4 21 1.00 0.60 1.46 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
G1 20 1.00 1.17 1.69 0.0 0.0 0.0 2.0 9.0 ▇▃▁▁▁
G2 20 1.00 1.07 1.77 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
G3 23 0.99 0.39 1.36 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
G4 22 0.99 0.98 1.76 0.0 0.0 0.0 1.0 9.0 ▇▂▁▁▁
GAF_C 569 0.87 69.31 32.03 -99.0 55.0 70.0 85.0 999.0 ▇▁▁▁▁
GAF_H 986 0.77 71.03 32.85 -99.0 58.0 70.0 86.0 999.0 ▇▁▁▁▁
GAF_PCTCHG 926 0.78 -0.88 5.22 -74.0 0.0 0.0 0.0 95.0 ▁▁▇▁▁
POS_345 1221 0.72 0.55 1.00 0.0 0.0 0.0 1.0 5.0 ▇▁▁▁▁
POS_6 1221 0.72 0.11 0.53 0.0 0.0 0.0 0.0 5.0 ▇▁▁▁▁
NEG_3456 1221 0.72 0.94 1.40 0.0 0.0 0.0 1.0 6.0 ▇▁▁▁▁
DIS_3456 1221 0.72 0.44 0.81 0.0 0.0 0.0 1.0 4.0 ▇▂▁▁▁
GEN_3456 1221 0.72 0.52 0.90 0.0 0.0 0.0 1.0 4.0 ▇▂▁▁▁
POSS_PRO_DX 3167 0.26 348.32 0.11 348.1 348.3 348.4 348.4 348.4 ▂▂▁▂▇

Now for our outcome of interest, SIPS/SOPS scores. First filtering by completion.

#list sops items
plist <- c("P1", "P2", "P3", "P4", "P5")
nlist <- c("N1", "N2", "N3", "N4", "N5", "N6")
dlist <- c("D1", "D2", "D3", "D4")
glist <- c("G1", "G2", "G3", "G4")
sops.item.list <- c(plist, nlist, dlist, glist)

#drop missing sops items
sips_filt <- sips %>% drop_na(c(plist))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(plist)` instead of `plist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
#sum(is.na(sips_filt[,c(sops.item.list)]))
length(unique(sips_filt$BBLID))
## [1] 1787

Some subjects have multiple assessments on one day (one from proband one from family, etc); need to be sure to pick only TYPE == "COMBINED" per discussion with David. If no combined, pick proband.

#if subj has more than one assessment for same day,dplyr::select Combined 
sips_filt<- sips_filt %>% mutate(TYPE = as.factor(TYPE))
levels(sips_filt$TYPE) 
##  [1] "AP"         "both"       "Both"       "collateral" "Collateral"
##  [6] "Combined"   "FC"         "FP"         "FPC"        "GO2MI_2"   
## [11] "GO2YPI_2"   "IC"         "IP"         "IPC"        "MI"        
## [16] "MP"         "P"          "proband"    "Proband"    "YPI"
sips_filt$TYPE <- fct_collapse(sips_filt$TYPE, "Proband" = c("Proband", "proband", "P", "FP", "IP"), "Combined"=c("Combined", "FPC")) #combine all proband
sips_filt$TYPE <- factor(sips_filt$TYPE, levels = c("Combined", "Proband", "Collateral"))

#group data todplyr::select correct type (defined as lowest ordered level) - also cleaning up
sips_filt2 <- sips_filt %>%
  group_by(BBLID, DOSIPS) %>%
  arrange(TYPE) %>%
  slice(c(1)) %>%
  ungroup() %>%
 dplyr::select(-SIPS_SOURCEID, -SOURCE_PROJECT) %>%
  mutate(DOSIPS=as.Date(DOSIPS))

length(unique(sips_filt2$BBLID)) #confirm no subjects deleted
## [1] 1787
skim(sips_filt2)
Data summary
Name sips_filt2
Number of rows 2884
Number of columns 40
_______________________
Column type frequency:
character 8
Date 1
factor 2
numeric 29
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
PROTOCOL 6 1.00 12 25 0 27 0
RATER 0 1.00 2 8 0 60 0
SPD 397 0.86 1 1 0 2 0
FHPI 584 0.80 1 1 0 2 0
FIRST_DEG_SCZ 948 0.67 1 1 0 2 0
PRODROMAL 884 0.69 1 1 0 2 0
CONSIDER 884 0.69 1 1 0 2 0
REVIEW 884 0.69 1 1 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DOSIPS 0 1 1970-01-01 2022-04-07 2015-03-22 1917

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
BBLID 0 1.00 FALSE 1787 109: 8, 145: 7, 164: 7, 177: 7
TYPE 80 0.97 FALSE 3 Pro: 1858, Com: 708, Col: 238

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
VISITNUM 132 0.95 1.04 0.28 1.0 1.0 1.0 1.0 4.0 ▇▁▁▁▁
P1 0 1.00 1.36 1.77 0.0 0.0 1.0 2.0 9.0 ▇▃▁▁▁
P2 0 1.00 1.20 1.61 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
P3 0 1.00 0.64 1.27 0.0 0.0 0.0 1.0 9.0 ▇▂▁▁▁
P4 0 1.00 1.20 1.76 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
P5 0 1.00 0.95 1.35 0.0 0.0 0.0 2.0 9.0 ▇▃▁▁▁
N1 4 1.00 1.20 1.62 0.0 0.0 1.0 2.0 9.0 ▇▂▁▁▁
N2 3 1.00 1.04 1.49 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
N3 4 1.00 0.90 1.47 0.0 0.0 0.0 1.0 9.0 ▇▂▁▁▁
N4 5 1.00 0.50 1.35 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
N5 3 1.00 1.27 1.63 0.0 0.0 1.0 2.0 9.0 ▇▃▂▁▁
N6 5 1.00 1.33 1.94 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
D1 2 1.00 0.58 1.27 0.0 0.0 0.0 1.0 9.0 ▇▁▁▁▁
D2 2 1.00 0.57 1.34 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
D3 4 1.00 1.43 1.57 0.0 0.0 1.0 3.0 9.0 ▇▆▁▁▁
D4 5 1.00 0.56 1.31 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
G1 5 1.00 1.21 1.57 0.0 0.0 0.0 2.0 9.0 ▇▃▁▁▁
G2 5 1.00 1.10 1.67 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
G3 7 1.00 0.36 1.17 0.0 0.0 0.0 0.0 9.0 ▇▁▁▁▁
G4 7 1.00 1.05 1.70 0.0 0.0 0.0 2.0 9.0 ▇▂▁▁▁
GAF_C 392 0.86 68.22 32.19 -99.0 53.0 68.0 83.0 999.0 ▇▁▁▁▁
GAF_H 740 0.74 70.03 33.34 -99.0 57.0 70.0 85.0 999.0 ▇▁▁▁▁
GAF_PCTCHG 658 0.77 -0.96 5.52 -72.0 0.0 0.0 0.0 95.0 ▁▁▇▁▁
POS_345 863 0.70 0.65 1.08 0.0 0.0 0.0 1.0 5.0 ▇▁▁▁▁
POS_6 863 0.70 0.15 0.61 0.0 0.0 0.0 0.0 5.0 ▇▁▁▁▁
NEG_3456 863 0.70 1.03 1.46 0.0 0.0 0.0 2.0 6.0 ▇▁▁▁▁
DIS_3456 863 0.70 0.49 0.84 0.0 0.0 0.0 1.0 4.0 ▇▃▁▁▁
GEN_3456 863 0.70 0.59 0.93 0.0 0.0 0.0 1.0 4.0 ▇▂▁▁▁
POSS_PRO_DX 2100 0.27 348.32 0.11 348.1 348.3 348.4 348.4 348.4 ▂▁▁▂▇

Calculate SOPS scores by summing within each subscale (seems to be valid metric based on clinicaltrials.gov)

#calculate sops scores
sips_scored<- sips_filt2 %>% mutate(sops_p = rowSums(across(plist), na.rm=F),
                                  sops_n = rowSums(across(nlist), na.rm=F),
                                  sops_d = rowSums(across(dlist), na.rm=F),
                                  sops_g = rowSums(across(glist), na.rm=F),
                                  sops_tot = rowSums(across(c(sops_p, sops_n, sops_d, sops_g), na.rm=F)))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(nlist)` instead of `nlist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(dlist)` instead of `dlist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(glist)` instead of `glist` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
#also dropping `3456` cols (and maybe GAF_PCTCHG?) since they don't seem to mean anything or be useful and it would be better to recalculate them anyway
#aksi dropping rater
sips_scored <- sips_scored %>%dplyr::select(-VISITNUM, -RATER, -REVIEW, -POS_345, -POS_6, - NEG_3456, -DIS_3456, - GEN_3456) %>% as.data.frame()

head(sips_scored)
##   BBLID             PROTOCOL     DOSIPS    TYPE P1 P2 P3 P4 P5 N1 N2 N3 N4 N5
## 1 11287 804019 - NAYA Assess 2006-01-30 Proband  2  0  2  0  0  0  2  0  0  3
## 2 11636         812481 - 22q 2011-02-09 Proband  6  6  3  6  5  5  3  1  5  1
## 3 11671 804019 - NAYA Assess 2006-08-12 Proband  4  0  1  5  3  1  0  0  2  5
## 4 11671 804019 - NAYA Assess 2008-04-17 Proband  0  3  0  1  0  3  0  1  0  3
## 5 11706         812481 - 22q 2011-04-27 Proband  0  0  0  1  0  0  0  0  0  0
## 6 11936 804019 - NAYA Assess 2008-05-05 Proband  6  6  0  1  0  5  5  0  0  0
##   N6 D1 D2 D3 D4 G1 G2 G3 G4 GAF_C GAF_H GAF_PCTCHG SPD FHPI FIRST_DEG_SCZ
## 1  0  0  0  1  0  0  0  0  0    85    85          0   N    Y             Y
## 2  2  5  2  4  1  3  5  3  4    47    47          0   N    N             N
## 3  6  2  0  0  2  3  0  0  1    47    45          4   N    Y             Y
## 4  0  0  0  0  0  0  0  0  0    63    63          0   N    Y             Y
## 5  0  0  0  0  0  0  0  0  0    83    83          0   N    N             N
## 6  6  0  2  3  0  0  6  0  3    41    55        -25   N    Y             N
##   PRODROMAL CONSIDER POSS_PRO_DX sops_p sops_n sops_d sops_g sops_tot
## 1         N        N          NA      4      5      1      0       10
## 2         N        Y          NA     26     17     12     15       70
## 3         Y        N       348.4     13     14      4      4       35
## 4         Y        N       348.4      4      7      0      0       11
## 5         N        N          NA      1      0      0      0        1
## 6         N        Y          NA     13     16      5      9       43

Merging into long scanlist and filtering for scans with baseline w/in 1 yr and at least one follow-up 1yr or more later

sips_scan_long <- inner_join(all_7t.long, sips_scored, by="BBLID")
head(sips_scan_long)
##    BBLID    date_7t scanner SCANID                PROTOCOL     DOSIPS    TYPE
## 1 100522 2021-12-11   Terra   <NA>        833922 - EvolPsy 2021-03-30 Proband
## 2 100898 2021-11-30   Terra  11977        833922 - EvolPsy 2021-10-08 Proband
## 3 102041 2018-06-15   Terra  10821            810336 - Go3 2015-12-11 Proband
## 4 102041 2022-01-25   Terra   <NA>            810336 - Go3 2015-12-11 Proband
## 5 102041 2022-03-24   Terra  12139            810336 - Go3 2015-12-11 Proband
## 6 105176 2015-02-26     ONM   9332 810336 - Go2 Supplement 2013-06-22 Proband
##   P1 P2 P3 P4 P5 N1 N2 N3 N4 N5 N6 D1 D2 D3 D4 G1 G2 G3 G4 GAF_C GAF_H
## 1  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0    NA    NA
## 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    NA    NA
## 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    94    94
## 4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    94    94
## 5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0    94    94
## 6  0  0  0  2  0  1  2  0  0  2  2  0  0  1  0  2  1  0  0    75    75
##   GAF_PCTCHG SPD FHPI FIRST_DEG_SCZ PRODROMAL CONSIDER POSS_PRO_DX sops_p
## 1         NA   N    N             N      <NA>     <NA>          NA      0
## 2         NA   N    N             N      <NA>     <NA>          NA      0
## 3          0   N    N             N         N        N          NA      0
## 4          0   N    N             N         N        N          NA      0
## 5          0   N    N             N         N        N          NA      0
## 6          0   N    N          <NA>         N        N          NA      2
##   sops_n sops_d sops_g sops_tot
## 1      0      1      0        1
## 2      0      0      0        0
## 3      0      0      0        0
## 4      0      0      0        0
## 5      0      0      0        0
## 6      7      1      3       13
#calc diff from each 7T scan (negative values -> dx before 7t)
sips_scan_long <- sips_scan_long %>%
  mutate(sips_diff_days = as.numeric(difftime(DOSIPS,date_7t, units = "days")),
         sips_diff_yrs = round((as.numeric(difftime(DOSIPS,date_7t, units = "weeks"))/52.25)/.5)*.5) %>% #time diff rounded to half-yr
  rename(date7t=date_7t)
         
sip.pre.scan_l <-  which(sips_scan_long$sips_diff_days <= 364 & sips_scan_long$sips_diff_days >= -365)
sip.post.scan_l <-  which(sips_scan_long$sips_diff_days > 364)

sips_scan_long.lab <- sips_scan_long %>%
  mutate(scan.time = case_when(row_number() %in% sip.pre.scan_l ~ "base", 
                                row_number() %in% sip.post.scan_l ~ "end"),) %>% #label every sops timepoint relative to each scan
  filter(!is.na(scan.time)) %>% #drop sops that don't meet criteria for base or endpoint 
  group_by(BBLID, date7t) %>%
  arrange(abs(sips_diff_days)) %>%
  mutate(row = row_number()) %>%
  filter(!(scan.time=="base" & row > 1)) %>% #if subject has more than one baseline SOPS, just get closest to scan
  ungroup() %>%
 dplyr::select(-row)

table(as.factor(sips_scan_long.lab$scan.time))
## 
## base  end 
##  120   58
n_unique(sips_scan_long.lab$BBLID)
## [1] 110
#drop scans with less than 2 timepoints
scanlist_clean <- sips_scan_long.lab %>%
  group_by(BBLID, date7t) %>%
  arrange(abs(sips_diff_days)) %>%
  filter(first(scan.time)=="base" & n() >= 2) %>% #only keep BBLID-scan pairs with 2+ timepoints, the first being baseline
  ungroup() %>%
  group_by(BBLID, date7t, scan.time) %>%
  mutate(row = row_number(),
         scan.time=paste(scan.time,as.character(row), sep="."), #number endpoints if subj has more than one
         observation = paste(as.character(BBLID), as.character(date7t), scan.time)) %>% #add indicator for observation
  ungroup() %>%
 dplyr::select(-row)

table(as.factor(scanlist_clean$scan.time))
## 
## base.1  end.1  end.2  end.3  end.4  end.5 
##     32     32      8      4      3      1
n_unique(scanlist_clean$BBLID)
## [1] 31
skim(scanlist_clean)
Data summary
Name scanlist_clean
Number of rows 80
Number of columns 44
_______________________
Column type frequency:
character 11
Date 2
factor 1
numeric 30
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
BBLID 0 1.00 5 6 0 31 0
scanner 0 1.00 3 5 0 2 0
SCANID 0 1.00 4 5 0 32 0
PROTOCOL 0 1.00 12 25 0 12 0
SPD 10 0.88 1 1 0 1 0
FHPI 18 0.78 1 1 0 2 0
FIRST_DEG_SCZ 22 0.72 1 1 0 2 0
PRODROMAL 35 0.56 1 1 0 2 0
CONSIDER 35 0.56 1 1 0 2 0
scan.time 0 1.00 5 6 0 6 0
observation 0 1.00 22 24 0 80 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date7t 0 1 2013-10-11 2018-12-18 2014-11-26 30
DOSIPS 0 1 2013-01-10 2022-02-18 2018-02-18 78

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
TYPE 0 1 FALSE 3 Pro: 69, Com: 10, Col: 1

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
P1 0 1.00 1.94 1.89 0.0 0.00 2.0 3.00 6.0 ▇▅▁▁▂
P2 0 1.00 1.35 1.54 0.0 0.00 1.0 2.00 6.0 ▇▂▂▁▁
P3 0 1.00 0.96 1.52 0.0 0.00 0.0 2.00 6.0 ▇▁▁▁▁
P4 0 1.00 1.50 1.86 0.0 0.00 0.0 3.00 6.0 ▇▁▂▁▁
P5 0 1.00 1.34 1.53 0.0 0.00 1.0 2.00 6.0 ▇▂▂▁▁
N1 0 1.00 1.04 1.26 0.0 0.00 1.0 2.00 5.0 ▇▃▁▁▁
N2 0 1.00 0.60 1.06 0.0 0.00 0.0 1.00 4.0 ▇▂▁▁▁
N3 0 1.00 0.80 1.13 0.0 0.00 0.0 1.00 4.0 ▇▃▂▂▁
N4 0 1.00 0.62 1.29 0.0 0.00 0.0 0.00 5.0 ▇▁▁▁▁
N5 0 1.00 0.74 1.13 0.0 0.00 0.0 1.00 4.0 ▇▂▂▁▁
N6 0 1.00 1.26 2.14 0.0 0.00 0.0 2.00 6.0 ▇▁▁▁▂
D1 0 1.00 0.50 1.01 0.0 0.00 0.0 1.00 5.0 ▇▁▁▁▁
D2 0 1.00 0.75 1.49 0.0 0.00 0.0 1.00 6.0 ▇▁▁▁▁
D3 1 0.99 0.94 1.26 0.0 0.00 0.0 2.00 4.0 ▇▂▂▂▁
D4 0 1.00 0.34 0.79 0.0 0.00 0.0 0.00 4.0 ▇▁▁▁▁
G1 0 1.00 0.89 1.47 0.0 0.00 0.0 2.00 6.0 ▇▁▂▁▁
G2 0 1.00 0.91 1.41 0.0 0.00 0.0 2.00 6.0 ▇▂▁▁▁
G3 0 1.00 0.00 0.00 0.0 0.00 0.0 0.00 0.0 ▁▁▇▁▁
G4 0 1.00 0.75 1.19 0.0 0.00 0.0 1.00 5.0 ▇▁▁▁▁
GAF_C 20 0.75 65.17 18.45 21.0 50.00 67.5 80.25 95.0 ▁▇▆▇▇
GAF_H 26 0.68 68.74 17.07 21.0 57.00 70.0 81.00 95.0 ▁▅▆▇▇
GAF_PCTCHG 25 0.69 -0.67 3.71 -25.0 0.00 0.0 0.00 0.0 ▁▁▁▁▇
POSS_PRO_DX 58 0.28 348.35 0.11 348.1 348.40 348.4 348.40 348.4 ▂▁▁▁▇
sops_p 0 1.00 7.09 6.62 0.0 2.00 5.5 10.25 29.0 ▇▅▂▁▁
sops_n 0 1.00 5.06 4.57 0.0 1.00 5.0 8.00 21.0 ▇▆▂▁▁
sops_d 1 0.99 2.53 3.08 0.0 0.00 1.0 4.00 13.0 ▇▃▁▁▁
sops_g 0 1.00 2.55 3.15 0.0 0.00 1.0 4.00 12.0 ▇▂▂▁▁
sops_tot 1 0.99 17.29 14.90 0.0 5.50 14.0 26.50 74.0 ▇▅▂▁▁
sips_diff_days 0 1.00 723.59 821.16 -274.0 -37.75 537.0 1273.75 2799.0 ▇▆▃▂▂
sips_diff_yrs 0 1.00 1.99 2.24 -0.5 0.00 1.5 3.50 7.5 ▇▅▃▁▂

Keeping data long for now for mixed random effects model!

Adding Covariates

Demographics

Adding sex, DOB/age, race (csv adapted from 7T list from David).

#load demographics (adapted from 7T list from David)
demo.phi <- read.csv("data/7t_demographics.csv", header = TRUE, stringsAsFactors = TRUE, na = c("", "..", ".")) %>%
  mutate(sex = as.factor(Sex_1M),
         Ethnicity = as.factor(Ethnicity),
         DOB = as.Date(DOB, format = "%m/%d/%y"),
         BBLID = as.factor(BBLID)) %>%
 dplyr::select(-Diagnostic_Group, -sex, -Age)

sops_demo.phi <- left_join(scanlist_clean, demo.phi, by="BBLID")
#skim(sops_demo.phi)
n_unique(sops_demo.phi$observation)
## [1] 80

Demographic variables have 100% completion.

Using DOB to calculate age at scan and at endpoint SOPS, then removing.

sops_demo <- sops_demo.phi %>%
  mutate(age_scan = as.numeric(difftime(date7t,DOB, units = "weeks"))/52.25,
         age_sops = as.numeric(difftime(DOSIPS,DOB, units = "weeks"))/52.25) %>%
 dplyr::select(-DOB)

summary(sops_demo$age_scan)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.70   16.79   19.49   19.65   21.40   26.02
summary(sops_demo$age_sops)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.44   19.11   21.46   21.63   23.67   29.42

Diagnosis

Now merge in diagnosis and filter to smallest datediff

#get just dates/matching info for observations
just_scans <- sops_demo %>%dplyr::select(BBLID, observation, DOSIPS, date7t, scan.time)

sops_dx <- left_join(just_scans, redcap, by="BBLID")
any(is.na(sops_dx$AXIS1_DX1)) #no primary dx missing
## [1] FALSE
#calculate datediff from dx to sops, keep just row closest
sops_dx.filt <- sops_dx %>%
  mutate(dx_sops_diff = as.numeric(difftime(DODIAGNOSIS,DOSIPS, units = "days"))) %>%
  arrange(abs(dx_sops_diff)) %>%
  group_by(observation) %>%
  slice_head() %>% #keep only closest dx
  ungroup() %>%
  filter(abs(dx_sops_diff) <= 100) %>% #restrict to w/in 100 days
 dplyr::select_if(~!all(is.na(.)))

sops_dx.filt %>% 
  ggplot(aes(x=dx_sops_diff, fill=scan.time)) + geom_bar() + facet_grid(cols = vars(scan.time))

#focusing on baseline
sops_dx.filt %>% filter(scan.time=="base.1") %>%
  mutate(dx_to_scan_diff = as.numeric(difftime(DODIAGNOSIS,date7t, units = "days"))) %>%
  ggplot(aes(x=dx_to_scan_diff)) + geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

#merge back in
n_unique(sops_dx.filt$observation)
## [1] 75
summary(sops_dx.filt$dx_sops_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -7.00000  0.00000  0.00000  0.06667  0.00000  8.00000
#skim(sops_change_dx)

Now merge back in

#drop stuff that's already there before merging
sops_dx.filt <- sops_dx.filt %>%
 dplyr::select(-BBLID, -DOSIPS, -date7t, -scan.time)

sops_dx.final <- left_join(sops_demo, sops_dx.filt, by="observation")
#skim(sops_dx.final)
n_unique(sops_dx.final$observation)
## [1] 80

First I’ll make an indicator for psychosis/prodrome at baseline or endpoint (since HSTATUS cols aren’t really great).

sops_dx.final %>%dplyr::select(contains("AXIS1_DX")) %>% 
  t %>% c %>% unique
##  [1] "348.4"   NA        "348.43"  "799.90B" "348.62"  "300.23"  "304.3"  
##  [8] "298.9"   "295.6"   "311"     "V71.09B" "296.26"  "305"     "314"    
## [15] "348.12"  "296.25"  "296.2"   "296.9"   "348.41"  "309.21"  "295.4"  
## [22] "V62.82"  "309.81"  "300.02"  "295.9"   "314.01"  "303.9"   "348.11" 
## [29] "296.32"  "300"     "296.35"  "300.3"   "348.42"
control.list <- c("V71.09B")
pro.list<- c("348.4", "348.43", "348.62", "348.42", "348.12", "348.41", "348.11")
psych.list <- c("295.6", "298.9", "295.4", "295.9")
other.list <- c("300", "300.23", "311", "309.21", "304.3", "296.35", "V62.82", "309.81", "300.02", "305", "296.36", "303.9", "296.32", "314", "296.25", "296.2", "296.26", "300.3", "296.9", "314.01", "799.90B")

#making indicator for each dx category
sops_dx.sum <- sops_dx.final %>% 
  mutate(dx_control = if_any(starts_with("AXIS1_DX"), ~ .x %in% control.list,1,0),
         dx_pro = if_any(starts_with("AXIS1_DX"), ~ .x %in% pro.list,1,0),
         dx_psych = if_any(starts_with("AXIS1_DX"), ~ .x %in% psych.list,1,0),
         dx_other = if_any(starts_with("AXIS1_DX"), ~ .x %in% other.list,1,0)) %>%
  mutate(diagnosis = as.factor(case_when(dx_pro == TRUE ~ "pro", #single col of dx
                                 dx_psych == TRUE ~ "psych",
                                 dx_other == TRUE ~ "other",
                                 dx_control == TRUE ~ "none")))

#adding a base_dx indicator, since if we're predicting outcomes we'd know this (though maybe v correlated with intercept)

sops_dx.sum <- sops_dx.sum %>%
  group_by(BBLID, SCANID) %>%
  mutate(base_dx=first(diagnosis),
         base_gafc=first(GAF_C),
         base_sops_p=first(sops_p),
         delta_sops=sops_p-base_sops_p,
         sops_cat = as.factor(case_when(delta_sops<0 ~ "remit",
                                        delta_sops==0 ~ "stable",
                                        delta_sops>0 ~ "progress")))%>%
  ungroup()

sops_dx.sum %>% 
  ggplot(aes(y=BBLID, x=scan.time, color=diagnosis)) + geom_point()

Diagnosis amongst baseline prodromal and psychotic disorder subjects seem to be largely stable over time.

And drop all the extra dx columns we don’t need

sops_dx.sum.filt <- sops_dx.sum %>%
 dplyr::select(!contains("AXIS1") & !contains("HSTATUS") & !contains("CONSIDER") & !contains("PRODROMAL") & !contains("POSS_PRO_DX"))

Medication Info

Load and clean med data

#load med data
meds <- read.csv("data/oracle_meds_all.csv", header = TRUE, stringsAsFactors = F, na = c("", "null"))
skim(meds)
Data summary
Name meds
Number of rows 17469
Number of columns 20
_______________________
Column type frequency:
character 15
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
COLLECTID 2 1.00 1 41 0 5829 0
BBLID 23 1.00 2 29 0 3818 0
PROTOCOL 11301 0.35 12 79 0 48 0
DOCOLLECT 31 1.00 10 21 0 3180 0
COLLECTBY 2202 0.87 1 8 0 184 0
MED_COLLECT 153 0.99 1 1 0 2 0
ENTBY 112 0.99 1 8 0 100 0
DOENT 12 1.00 10 10 0 2809 0
NOTES 7225 0.59 1 138 0 128 15
MEDICINE 2576 0.85 2 114 0 1278 0
PHARMCLASS 4245 0.76 6 1030 0 276 32
DOMED_START 3360 0.81 10 21 0 4470 0
DOMED_END 6620 0.62 10 21 0 4435 0
IS_CURRENT 2649 0.85 1 1 0 3 0
COMMENTS 15786 0.10 1 252 0 1087 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
VISITNUM 13651 0.22 1.63 17.49 0 1.00 1 1.00 900 ▇▁▁▁▁
DOSAGE 4553 0.74 494.13 1833.23 0 4.00 20 150.00 24000 ▇▁▁▁▁
ROUTE 4758 0.73 1.05 0.37 0 1.00 1 1.00 9 ▇▁▁▁▁
DURATION 6628 0.62 14.07 35.23 -9 0.00 2 12.00 481 ▇▁▁▁▁
MEDICINEID 2581 0.85 7780.22 4662.10 1 3740.75 7476 11900.25 15986 ▇▇▇▇▇
#each med is it's own row, so need to rotate wide
meds <- meds %>%
  group_by(BBLID, DOCOLLECT) %>%
  mutate(ID = paste(as.character(BBLID), as.character(DOCOLLECT)), #col to ID subj+timepoint
         count = row_number(), #count meds entered
         DOCOLLECT= as.Date(DOCOLLECT)) %>% 
  ungroup()

hist(meds$count)

medswide <- meds %>%
 dplyr::select(BBLID, DOCOLLECT, ID, count, MEDICINE, NOTES, IS_CURRENT) %>%
  mutate(BBLID=as.factor(BBLID)) %>%
  pivot_wider(., names_from = count, values_from=c(MEDICINE, NOTES, IS_CURRENT)) # pivot

#get subset
meds_reduced <- meds %>%
 dplyr::select(BBLID, DOCOLLECT, MED_COLLECT, NOTES, MEDICINE, PHARMCLASS, DOMED_START, DOMED_END, IS_CURRENT)

Trying to add PNC data to see if that will help get more baseline info

pnc.meds <- read.csv("data/n1601_health_with_meds_20170421.csv", header = TRUE, stringsAsFactors = F, na ="") %>%
  mutate(BBLID=as.factor(bblid)) %>%
 dplyr::select(-bblid)
skim(pnc.meds)
Data summary
Name pnc.meds
Number of rows 1601
Number of columns 30
_______________________
Column type frequency:
character 11
factor 1
numeric 18
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
med1 726 0.55 3 27 0 233 0
med2 1081 0.32 3 27 0 178 0
med3 1295 0.19 3 29 0 127 0
med4 1424 0.11 3 29 0 92 0
med5 1503 0.06 4 26 0 62 0
med6 1542 0.04 6 26 0 46 0
med7 1564 0.02 3 35 0 26 0
med8 1586 0.01 3 13 0 14 0
med9 1591 0.01 6 13 0 9 0
med10 1595 0.00 4 15 0 6 0
med11 1597 0.00 6 10 0 4 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
BBLID 0 1 FALSE 1601 800: 1, 801: 1, 801: 1, 802: 1

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
scanid 0 1 5182.26 1436.06 2632 4004 5148 6215 8470 ▆▇▇▆▃
smrytrt_psychinpt 0 1 0.04 0.19 0 0 0 0 1 ▇▁▁▁▁
psychoactiveMedPsychv2 0 1 0.11 0.32 0 0 0 0 1 ▇▁▁▁▁
psychoactiveMedMedicalv2 0 1 0.04 0.20 0 0 0 0 1 ▇▁▁▁▁
incidentalFindingExclude 0 1 0.01 0.11 0 0 0 0 1 ▇▁▁▁▁
medicalratingExcludev1 0 1 0.05 0.22 0 0 0 0 1 ▇▁▁▁▁
healthExcludev2 0 1 0.10 0.29 0 0 0 0 1 ▇▁▁▁▁
ltnExcludev2 0 1 0.21 0.41 0 0 0 0 1 ▇▁▁▁▂
squeakycleanExclude 0 1 0.74 0.44 0 0 1 1 1 ▃▁▁▁▇
psychoactiveMedMedical 0 1 0.04 0.20 0 0 0 0 1 ▇▁▁▁▁
medclass_Antipsychotic 0 1 0.02 0.13 0 0 0 0 1 ▇▁▁▁▁
medclass_Anticonvulsant 0 1 0.01 0.11 0 0 0 0 1 ▇▁▁▁▁
medclass_Antidepressant 0 1 0.03 0.17 0 0 0 0 1 ▇▁▁▁▁
medclass_Benzodiazepine 0 1 0.00 0.07 0 0 0 0 1 ▇▁▁▁▁
medclass_Stimulant 0 1 0.07 0.26 0 0 0 0 1 ▇▁▁▁▁
medclass_NonstimulantADHDmed 0 1 0.02 0.13 0 0 0 0 1 ▇▁▁▁▁
medclass_Other 0 1 0.00 0.04 0 0 0 0 1 ▇▁▁▁▁
medclass_Lithium 0 1 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
#get managable subset of scan list
scan_dates <- sops_dx.sum.filt %>%
 dplyr::select(BBLID, date7t, observation) %>%
  distinct()

pnc.meds_scan_date <- inner_join(scan_dates, pnc.meds, by="BBLID") %>%
  distinct(observation, .keep_all = T)
n_unique(pnc.meds_scan_date$BBLID)
## [1] 27

Inferring based on PNC data collection window that all meds are within study range, will add into lifetime calculations.

Lets go back and look at lifetime:

#merge with scan lists
med_dates <- inner_join(just_scans, meds_reduced, by="BBLID")
n_unique(med_dates$observation)
## [1] 80
med_lifetime <- med_dates %>%
  group_by(observation) %>%
  mutate(meds_sops_diff=as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days"))) %>% #DOCOLLECT - DOSIPS, so if + then DOCOLLECT is after SOPS
  filter((is.na(DOMED_START) & meds_sops_diff <= 100) | DOSIPS >= DOMED_START) %>% #keep if med is STARTED BEFORE SOPS (or start date unknown but collect is no later than 100 days after)
  mutate(count = row_number()) %>% #add counts so i can pivot after
  ungroup()
  
med_lifetime$MEDICINE[is.na(med_lifetime$MEDICINE)] <- "empty entry"
skim(med_lifetime)
Data summary
Name med_lifetime
Number of rows 383
Number of columns 15
_______________________
Column type frequency:
character 10
Date 3
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
BBLID 0 1.00 5 6 0 31 0
observation 0 1.00 22 24 0 77 0
scan.time 0 1.00 5 6 0 6 0
MED_COLLECT 8 0.98 1 1 0 2 0
NOTES 365 0.05 1 138 0 7 3
MEDICINE 0 1.00 4 37 0 74 0
PHARMCLASS 249 0.35 14 332 0 29 0
DOMED_START 227 0.41 10 10 0 47 0
DOMED_END 333 0.13 10 10 0 20 0
IS_CURRENT 164 0.57 1 1 0 3 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
DOSIPS 0 1 2014-01-08 2022-02-18 2019-01-29 75
date7t 0 1 2013-10-11 2018-12-18 2015-08-26 30
DOCOLLECT 0 1 2013-11-14 2022-03-08 2018-03-27 133

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
meds_sops_diff 0 1 -492.30 806.47 -2600 -1016 -364 0 2198 ▂▃▇▁▁
count 0 1 6.06 6.38 1 2 4 8 31 ▇▂▁▁▁
#trying to find distinct meds for each observation
med_lifetime_dist <- med_lifetime %>%
  group_by(observation) %>%
  distinct(MEDICINE, .keep_all = T) %>% #keep distinct
  mutate(count=row_number()) %>% #update counts so i can pivot after
  ungroup() %>%
 dplyr::select(BBLID, DOSIPS, observation, MEDICINE, NOTES, count)
table(med_lifetime_dist$MEDICINE)
## 
##  LEVONORGESTREL AND ETHINYL ESTRADIOL                               ABILIFY 
##                                     1                                     6 
##                              ADDERALL                           ADDERALL XR 
##                                     4                                     1 
##                                ADVAIR                             ALBUTERAL 
##                                     4                                     2 
##                             ALBUTEROL                      ALLERGY MEDS OTC 
##                                     7                                     3 
##                                AVIANE                              BENADRYL 
##                                     2                                     2 
##                         BIRTH CONTROL                        BIRTH CONTROL  
##                                     2                                     6 
##                             BUSPIRONE                               BUTEROL 
##                                     1                                     2 
##                    CALCIUM SUPPLEMENT                            CARVEDILOL 
##                                     2                                     1 
##                             CIMBACORT                             CLONIDINE 
##                                     2                                     2 
##                              COGENTIN                              CONCERTA 
##                                     2                                     2 
##                                 DDAVP                             DEXEDRINE 
##                                     4                                     2 
##      DEXMETHYLPHENIDATE HYDROCHLORIDE                            DICLAFONEC 
##                                     1                                     2 
##                            DICLOFENAC                           empty entry 
##                                     2                                    63 
##                       FERROUS SULFATE                               FLONASE 
##                                     2                                     8 
##                               FLOVENT                             IBUPROFEN 
##                                     7                                     2 
##                                  IRON                             ISENTRESS 
##                                     2                                     2 
##                            LISINOPRIL    LISINOPRIL AND HYDROCHLOROTHIAZIDE 
##                                     2                                     1 
##                       LOESTRIN 1.5/30                            LORATADINE 
##                                     4                                     2 
##             LORATADINE ALLERGY RELIEF                            LORATIDINE 
##                                     2                                     2 
##                             MACA ROOT                           MANE CHOICE 
##                                     2                                     1 
##                             MELATONIN                           MICROGESTIN 
##                                     1                                     4 
##                                MOTRIN             MULTI VITAMIN AND MINERAL 
##                                     5                                     2 
##                          MULTIVITAMIN            MULTIVITAMIN WITH FLUORIDE 
##                                     2                                     2 
##                       MUSCLE RELAXERS                            NEXAPLANON 
##                                     2                                     2 
##                                  NONE                 NORETHINDRONE ACETATE 
##                                     3                                     4 
##                   ORTHO TRI CYCLEN LO                             OXYCODONE 
##                                     2                                     2 
##   OXYCODONE HYDROCHLORIDE AND ASPIRIN                       PROMETHAZINE VC 
##                                     1                                     1 
##                                PROZAC                            RANITIDINE 
##                                     4                                     2 
##                              RATIDINE                           RISPERIDONE 
##                                     2                                     2 
## RISPERIDONE ORAL DISINTEGRATE TABLETS                              SEROQUEL 
##                                     2                                     2 
##                              SPRINTEC                                SRONYX 
##                                     3                                     4 
##                             SYMBICORT                                 TENEX 
##                                     2                                     2 
##                           TRILOMARZIA                               TRUVADA 
##                                     2                                     2 
##                               TYLENOL                             VITAMIN D 
##                                     5                                     1 
##                               VYVANSE                            WELLBUTRIN 
##                                     2                                     2 
##                                 XANAX                                ZOLOFT 
##                                     2                                     2 
##                                ZYRTEC                              ZYRTEC-D 
##                                     6                                     3
#pivot
med_lifetime_dist.wide <- pivot_wider(med_lifetime_dist, id_cols=observation, names_from = count, values_from=c(MEDICINE, NOTES)) %>% # pivot
 dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty
n_unique(med_lifetime_dist.wide$observation)
## [1] 77
n_unique(med_lifetime_dist$BBLID)
## [1] 31

If an individual only has empty entry value then we can infer that they are not taking any medications. Restricting to either date of collection no later than 100 days post-SOPS OR medicaiton being noted as starting before the SOPS date, there’s med info for all subjects but not for all observations.

Integrate PNC meds:

pnc.meds.only <- pnc.meds_scan_date %>%
 dplyr::select(observation, 22:32)
pnc.meds.only$med1[is.na(pnc.meds.only$med1)] <- "empty entry"

med_lifetime_comp <- full_join(med_lifetime_dist.wide, pnc.meds.only, by="observation")

#check if any observations are still missing med info:
med_lifetime_comp %>% filter(is.na(MEDICINE_1) & is.na(med1)) %>% nrow
## [1] 0

Yay, no observations are unaccounted for. Save out and manually code for psych rx:

write.csv(med_lifetime_comp, "data/med_lifetime_comp.csv", row.names=FALSE, na="")

Read back in and merge coded lifetime meds:

meds.coded <- read.csv("data/med_lifetime_comp.CODED.csv", header = TRUE, stringsAsFactors = F, na="") %>%
 dplyr::select(observation, psych.rx_life) %>%
  mutate(psych.rx_life=as.factor(psych.rx_life))

sops_change_meds.life <- left_join(sops_dx.sum.filt, meds.coded, by="observation")
sops_change_meds.life <- sops_change_meds.life %>%
  group_by(BBLID, SCANID) %>%
  mutate(base_rx=first(psych.rx_life)) %>%
  ungroup()

n_unique(sops_change_meds.life$observation)
## [1] 80
#skim(sops_change_meds.life)

CEST values

Read in processed CEST values from Harvard Oxford Subcortical Atlas ROIs:

cest.rois <- read.csv("data/GluCEST-HarvardOxford-Subcortical-Measures.csv", header = TRUE, stringsAsFactors = F, sep = '\t', na="NaN")
skim(cest.rois)
Data summary
Name cest.rois
Number of rows 26
Number of columns 64
_______________________
Column type frequency:
character 1
logical 27
numeric 36
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Subject 0 1 86 90 0 26 0

Variable type: logical

skim_variable n_missing complete_rate mean count
L_Caudate_mean 26 0 NaN :
L_Caudate_numvoxels 26 0 NaN :
L_Caudate_SD 26 0 NaN :
L_Putamen_mean 26 0 NaN :
L_Putamen_numvoxels 26 0 NaN :
L_Putamen_SD 26 0 NaN :
L_Pallidum_mean 26 0 NaN :
L_Pallidum_numvoxels 26 0 NaN :
L_Pallidum_SD 26 0 NaN :
L_Hipp_mean 26 0 NaN :
L_Hipp_numvoxels 26 0 NaN :
L_Hipp_SD 26 0 NaN :
L_Amygdala_mean 26 0 NaN :
L_Amygdala_numvoxels 26 0 NaN :
L_Amygdala_SD 26 0 NaN :
L_Accumbens_mean 26 0 NaN :
L_Accumbens_numvoxels 26 0 NaN :
L_Accumbens_SD 26 0 NaN :
R_Putamen_mean 26 0 NaN :
R_Putamen_numvoxels 26 0 NaN :
R_Putamen_SD 26 0 NaN :
R_Pallidum_mean 26 0 NaN :
R_Pallidum_numvoxels 26 0 NaN :
R_Pallidum_SD 26 0 NaN :
R_Amygdala_mean 26 0 NaN :
R_Amygdala_numvoxels 26 0 NaN :
R_Amygdala_SD 26 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
L_CerebralWM_mean 25 0.04 2.080000e+00 NA 2.08 2.08 2.08 2.08 2.080e+00 ▁▁▇▁▁
L_CerebralWM_numvoxels 25 0.04 3.450000e+02 NA 345.00 345.00 345.00 345.00 3.450e+02 ▁▁▇▁▁
L_CerebralWM_SD 25 0.04 5.480000e+00 NA 5.48 5.48 5.48 5.48 5.480e+00 ▁▁▇▁▁
L_CerebralCortex_mean 17 0.35 7.710000e+00 3.380000e+00 1.93 6.52 6.90 8.19 1.280e+01 ▂▂▇▁▃
L_CerebralCortex_numvoxels 17 0.35 4.086700e+02 2.975700e+02 5.00 201.00 327.00 603.00 8.500e+02 ▅▇▂▂▅
L_CerebralCortex_SD 17 0.35 2.330000e+00 1.010000e+00 0.90 1.69 2.09 3.17 3.870e+00 ▇▇▇▃▇
L_LatVent_mean 25 0.04 3.890000e+00 NA 3.89 3.89 3.89 3.89 3.890e+00 ▁▁▇▁▁
L_LatVent_numvoxels 25 0.04 4.100000e+01 NA 41.00 41.00 41.00 41.00 4.100e+01 ▁▁▇▁▁
L_LatVent_SD 25 0.04 1.730000e+00 NA 1.73 1.73 1.73 1.73 1.730e+00 ▁▁▇▁▁
L_Thalamus_mean 25 0.04 8.260000e+00 NA 8.26 8.26 8.26 8.26 8.260e+00 ▁▁▇▁▁
L_Thalamus_numvoxels 25 0.04 6.800000e+01 NA 68.00 68.00 68.00 68.00 6.800e+01 ▁▁▇▁▁
L_Thalamus_SD 25 0.04 2.760000e+00 NA 2.76 2.76 2.76 2.76 2.760e+00 ▁▁▇▁▁
BrainStem_mean 0 1.00 6.700000e+00 2.050000e+00 2.86 5.40 6.33 8.64 1.052e+01 ▃▇▅▇▃
BrainStem_numvoxels 0 1.00 6.918800e+02 2.264900e+02 284.00 584.50 657.50 881.75 1.000e+03 ▃▁▆▂▇
Brainstem_SD 0 1.00 3.020000e+00 1.140000e+00 1.44 1.96 2.97 3.83 5.560e+00 ▇▅▅▅▁
R_CerebralWM_mean 0 1.00 5.280000e+00 1.280000e+00 1.23 4.80 5.47 5.95 7.280e+00 ▁▁▃▇▃
R_CerebralWM_numvoxels 0 1.00 1.121420e+03 5.192600e+02 373.00 637.25 989.00 1584.75 2.053e+03 ▇▆▃▃▅
R_CerebralWM_SD 0 1.00 2.750000e+00 8.100000e-01 1.53 2.06 2.77 3.33 4.350e+00 ▇▂▇▃▂
R_CerebralCortex_mean 0 1.00 6.620000e+00 1.740000e+00 1.83 5.97 6.33 8.14 8.870e+00 ▂▁▃▇▇
R_CerebralCortex_numvoxels 0 1.00 5.082150e+03 1.513020e+03 2881.00 3937.50 4775.00 6032.25 8.512e+03 ▇▇▃▃▃
R_CerebralCortex_SD 0 1.00 2.160000e+00 6.300000e-01 1.31 1.71 2.00 2.53 3.560e+00 ▇▇▇▁▃
R_LatVent_mean 0 1.00 3.750000e+00 1.780000e+00 -1.31 2.98 3.74 4.90 6.760e+00 ▁▁▅▇▂
R_LatVent_numvoxels 0 1.00 5.423000e+01 3.925000e+01 0.00 29.00 43.50 80.00 1.680e+02 ▇▇▅▂▁
R_LatVent_S 0 1.00 2.310000e+00 8.500000e-01 0.00 1.84 2.37 2.92 3.800e+00 ▂▁▇▇▃
R_Thalamus_mean 0 1.00 6.980000e+00 1.940000e+00 2.31 6.03 6.85 8.04 1.101e+01 ▁▃▇▅▂
R_Thalamus_numvoxels 0 1.00 2.887700e+02 1.276000e+02 2.00 228.25 293.00 385.75 4.840e+02 ▃▂▇▆▇
R_Thalamus_SD 0 1.00 2.120000e+00 6.800000e-01 0.71 1.73 2.06 2.56 3.500e+00 ▂▇▆▇▂
R_Caudate_mean 13 0.50 5.230000e+00 2.720000e+00 0.00 4.47 5.72 7.38 8.400e+00 ▃▁▆▇▇
R_Caudate_numvoxels 13 0.50 6.238000e+01 5.778000e+01 0.00 17.00 63.00 85.00 1.950e+02 ▇▅▅▂▂
R_Caudate_SD 13 0.50 7.692308e+28 2.773501e+29 0.00 1.03 1.68 2.03 1.000e+30 ▇▁▁▁▁
R_Hipp_mean 25 0.04 0.000000e+00 NA 0.00 0.00 0.00 0.00 0.000e+00 ▁▁▇▁▁
R_Hipp_numvoxels 25 0.04 0.000000e+00 NA 0.00 0.00 0.00 0.00 0.000e+00 ▁▁▇▁▁
R_Hipp_SD 25 0.04 0.000000e+00 NA 0.00 0.00 0.00 0.00 0.000e+00 ▁▁▇▁▁
R_Accumbens_mean 9 0.65 6.640000e+00 2.160000e+00 1.63 5.94 6.84 7.81 1.081e+01 ▁▂▇▅▃
R_Accumbens_numvoxels 9 0.65 2.906000e+01 2.147000e+01 1.00 9.00 32.00 43.00 7.000e+01 ▇▁▅▃▂
R_Accumbens_SD 9 0.65 5.882353e+28 2.425356e+29 0.00 0.78 1.09 1.53 1.000e+30 ▇▁▁▁▁
#clean up a bit
cest.vals <- cest.rois %>%
 dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
  mutate(scanid = str_split(as.character(Subject), "/", simplify = T)[, 6]) %>% #get BBLID_scanID 
 dplyr::select(-Subject)

#GM voxels
cest.gm.rois <- read.csv("data/GM-HarvardOxford-Subcortical-Measures.csv", header = TRUE, stringsAsFactors = F, sep = '\t', na="NaN")

#clean up a bit
gm.vals <- cest.gm.rois %>%
 dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
  mutate(scanid = str_split(as.character(Subject), "/", simplify = T)[, 6]) %>% #get BBLID_scanID 
 dplyr::select(-Subject) %>%
  rename_with(.cols = 1:(ncol(.)-1), function(x){paste0("gm.", x)}) #add prefix

ALL SCANS HAVE RIGHT THALAMUS DATA!

one subject also weirdly has GM values in L_CerebralWM_mean, outputting scan ID so I can look at the niftis

gm.vals %>% filter(!is.na(gm.L_CerebralWM_numvoxels)) %>%dplyr::select(scanid, gm.L_CerebralWM_numvoxels)
##        scanid gm.L_CerebralWM_numvoxels
## 1 94276_10927                       106

Looks like slab was placed across the midline, seeing if other scans have comparable gm voxels in R_CerebralWM_mean as comparison.

gm.vals %>% filter(!is.na(gm.R_CerebralWM_numvoxels)) %>%dplyr::select(scanid, gm.R_CerebralWM_numvoxels)
##          scanid gm.R_CerebralWM_numvoxels
## 1   105176_9332                       384
## 2   106573_8900                       720
## 3   112126_8979                       906
## 4  117397_10686                       582
## 5   120217_9029                       694
## 6   121000_8859                       583
## 7   121407_8987                       547
## 8  125073_10972                       293
## 9  132179_10918                       151
## 10  132179_9726                       199
## 11  139272_8985                       257
## 12   18093_8924                       783
## 13  19790_10934                       333
## 14   83835_8777                       300
## 15   85369_9132                       542
## 16   87225_9459                       411
## 17  91919_10683                       327
## 18  91962_11090                       315
## 19   92886_9087                       483
## 20   93242_9146                       516
## 21  93274_10992                       297
## 22   94028_9203                      1029
## 23  94276_10927                       249
## 24  96659_11096                       256
## 25  97994_11114                       215
## 26   99949_9031                       284

Seems like having errant GM voxels (calculated probablistically via FAST) isn’t uncommon in WM (as defined by HO Subcortical atlas), what makes 94276_10927 unique is slab orientation.

Merging cest values, remove wonky mOFC data and filter to thalamic ROI

all.cest <- merge(cest.vals, gm.vals, by="scanid") %>%
  filter(!scanid %in% c("105176_9332", "132179_9726", "92886_9087", "87225_9459", "94028_9203"))

sum(duplicated(all.cest$BBLID)) #no duplicate subjects remain
## [1] 0
r.thal.dat <- all.cest %>%dplyr::select("scanid" | contains("R_Thalamus"))
skim(r.thal.dat)
Data summary
Name r.thal.dat
Number of rows 21
Number of columns 7
_______________________
Column type frequency:
character 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
scanid 0 1 10 12 0 21 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
R_Thalamus_mean 0 1 7.48 1.67 5.14 6.07 7.12 8.10 11.01 ▇▇▇▂▂
R_Thalamus_numvoxels 0 1 319.38 111.65 80.00 242.00 340.00 418.00 484.00 ▂▃▇▇▇
R_Thalamus_SD 0 1 2.27 0.58 1.45 1.79 2.28 2.61 3.50 ▇▁▆▂▂
gm.R_Thalamus_mean 0 1 0.62 0.10 0.40 0.56 0.62 0.66 0.86 ▁▃▇▃▁
gm.R_Thalamus_numvoxels 0 1 223.14 71.34 126.00 160.00 221.00 300.00 341.00 ▇▅▇▁▇
gm.R_Thalamus_SD 0 1 0.35 0.03 0.23 0.35 0.35 0.37 0.39 ▁▁▂▇▅
r.thal.dat %>%dplyr::select(-scanid) %>% ggpairs()

#check for correlation of thalamic volume captured and avg. CEST
cor.test(r.thal.dat$R_Thalamus_mean, r.thal.dat$R_Thalamus_numvoxels, method=c("pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  r.thal.dat$R_Thalamus_mean and r.thal.dat$R_Thalamus_numvoxels
## t = 1.794, df = 19, p-value = 0.08875
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06113673  0.69765788
## sample estimates:
##       cor 
## 0.3805947

Final Dataframe

Merging into other FINAL DATASET

clin.to.merge <- sops_change_meds.life %>%
  mutate(scanid=paste(BBLID, SCANID, sep="_"))

final.df <- right_join(clin.to.merge, r.thal.dat, by="scanid")
n_unique(final.df$observation)
## [1] 48
n_unique(final.df$BBLID)
## [1] 21
#scaled dataframe
final.df_scaled <- final.df %>% mutate_if(is.numeric, scale)

#baseline only
final.base <- final.df %>% filter(scan.time=="base.1") %>%
  mutate(sex=recode_factor(Sex_1M, `1`="Male", `2`="Female"))

#followups only 
final.end <- final.df %>% filter(scan.time != "base.1")

#followups only - scaled
final.end_scaled <- final.end %>% mutate_if(is.numeric, scale)

EDA

Plotting correlations

skim(final.df)
Data summary
Name final.df
Number of rows 48
Number of columns 68
_______________________
Column type frequency:
character 11
Date 3
factor 8
logical 4
numeric 42
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
BBLID 0 1.00 5 6 0 21 0
scanner 0 1.00 3 5 0 2 0
SCANID 0 1.00 4 5 0 21 0
PROTOCOL 0 1.00 12 25 0 11 0
SPD 8 0.83 1 1 0 1 0
FHPI 14 0.71 1 1 0 2 0
FIRST_DEG_SCZ 16 0.67 1 1 0 2 0
scan.time 0 1.00 5 6 0 5 0
observation 0 1.00 22 24 0 48 0
AGEONSET_CLINICRISK 38 0.21 1 2 0 10 0
scanid 0 1.00 10 12 0 21 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date7t 0 1.00 2014-03-07 2018-12-18 2014-10-27 20
DOSIPS 0 1.00 2014-01-08 2022-02-18 2018-06-10 48
DODIAGNOSIS 2 0.96 2014-01-08 2022-02-18 2018-06-10 46

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
TYPE 0 1.00 FALSE 2 Pro: 43, Com: 5, Col: 0
Race 0 1.00 FALSE 4 Bla: 33, Whi: 11, Asi: 2, Mix: 2
Ethnicity 0 1.00 FALSE 1 2: 48, 1: 0
diagnosis 2 0.96 FALSE 4 pro: 24, psy: 10, non: 9, oth: 3
base_dx 0 1.00 FALSE 4 pro: 19, non: 16, psy: 11, oth: 2
sops_cat 0 1.00 FALSE 3 sta: 25, pro: 13, rem: 10
psych.rx_life 0 1.00 FALSE 2 0: 39, 1: 9
base_rx 0 1.00 FALSE 2 0: 40, 1: 8

Variable type: logical

skim_variable n_missing complete_rate mean count
dx_control 0 1 0.19 FAL: 39, TRU: 9
dx_pro 0 1 0.50 FAL: 24, TRU: 24
dx_psych 0 1 0.21 FAL: 38, TRU: 10
dx_other 0 1 0.33 FAL: 32, TRU: 16

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
P1 0 1.00 2.02 2.03 0.00 0.00 2.00 3.25 6.00 ▇▅▁▁▃
P2 0 1.00 1.38 1.65 0.00 0.00 1.00 2.00 6.00 ▇▂▁▁▁
P3 0 1.00 0.83 1.40 0.00 0.00 0.00 1.25 6.00 ▇▁▁▁▁
P4 0 1.00 1.52 1.97 0.00 0.00 0.00 3.00 6.00 ▇▁▂▁▁
P5 0 1.00 1.23 1.64 0.00 0.00 0.00 2.00 6.00 ▇▁▂▁▁
N1 0 1.00 0.94 1.17 0.00 0.00 1.00 2.00 5.00 ▇▂▁▁▁
N2 0 1.00 0.65 1.10 0.00 0.00 0.00 1.00 4.00 ▇▁▁▁▁
N3 0 1.00 0.60 0.92 0.00 0.00 0.00 1.00 3.00 ▇▂▁▁▁
N4 0 1.00 0.65 1.38 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
N5 0 1.00 0.50 1.03 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
N6 0 1.00 1.15 2.16 0.00 0.00 0.00 1.00 6.00 ▇▁▁▁▂
D1 0 1.00 0.50 1.09 0.00 0.00 0.00 0.00 5.00 ▇▁▁▁▁
D2 0 1.00 0.75 1.59 0.00 0.00 0.00 0.00 6.00 ▇▁▁▁▁
D3 1 0.98 0.85 1.20 0.00 0.00 0.00 2.00 4.00 ▇▂▂▂▁
D4 0 1.00 0.27 0.74 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
G1 0 1.00 0.96 1.61 0.00 0.00 0.00 1.25 6.00 ▇▁▂▁▁
G2 0 1.00 0.98 1.60 0.00 0.00 0.00 2.00 6.00 ▇▂▁▁▁
G3 0 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
G4 0 1.00 0.85 1.27 0.00 0.00 0.00 1.00 5.00 ▇▁▁▁▁
GAF_C 15 0.69 65.79 21.25 21.00 50.00 71.00 84.00 94.00 ▁▅▂▃▇
GAF_H 21 0.56 69.04 19.43 21.00 54.00 71.00 84.00 94.00 ▁▃▃▂▇
GAF_PCTCHG 21 0.56 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▁▁▇▁▁
sops_p 0 1.00 6.98 7.21 0.00 1.75 5.00 10.50 29.00 ▇▂▂▁▁
sops_n 0 1.00 4.48 4.53 0.00 1.00 3.50 7.25 21.00 ▇▅▁▁▁
sops_d 1 0.98 2.38 3.17 0.00 0.00 1.00 3.50 13.00 ▇▃▁▁▁
sops_g 0 1.00 2.79 3.72 0.00 0.00 1.00 4.25 12.00 ▇▁▁▁▁
sops_tot 1 0.98 16.72 16.62 0.00 4.50 12.00 26.00 74.00 ▇▃▁▁▁
sips_diff_days 0 1.00 714.17 878.28 -250.00 -37.75 515.50 1024.25 2799.00 ▇▆▂▁▂
sips_diff_yrs 0 1.00 1.95 2.41 -0.50 0.00 1.50 2.75 7.50 ▇▅▂▁▂
Sex_1M 0 1.00 1.69 0.47 1.00 1.00 2.00 2.00 2.00 ▃▁▁▁▇
age_scan 0 1.00 20.03 2.66 16.26 17.24 19.95 22.23 25.09 ▇▃▆▅▅
age_sops 0 1.00 21.98 3.08 16.99 19.84 21.79 23.59 29.42 ▅▇▇▂▂
dx_sops_diff 2 0.96 0.02 1.58 -7.00 0.00 0.00 0.00 8.00 ▁▁▇▁▁
base_gafc 2 0.96 68.39 20.86 21.00 50.00 78.00 87.00 94.00 ▁▅▂▂▇
base_sops_p 0 1.00 6.50 7.51 0.00 1.00 4.00 9.00 29.00 ▇▂▂▁▁
delta_sops 0 1.00 0.48 4.68 -13.00 0.00 0.00 2.00 15.00 ▁▁▇▂▁
R_Thalamus_mean 0 1.00 7.31 1.66 5.14 6.05 7.05 8.10 11.01 ▇▆▆▂▂
R_Thalamus_numvoxels 0 1.00 302.98 118.71 80.00 238.00 305.00 398.50 484.00 ▅▃▇▇▇
R_Thalamus_SD 0 1.00 2.25 0.54 1.45 1.79 2.23 2.59 3.50 ▇▂▆▂▂
gm.R_Thalamus_mean 0 1.00 0.62 0.09 0.40 0.56 0.60 0.66 0.86 ▁▅▇▃▁
gm.R_Thalamus_numvoxels 0 1.00 216.10 69.30 126.00 159.25 196.00 265.50 341.00 ▇▃▅▁▆
gm.R_Thalamus_SD 0 1.00 0.35 0.03 0.23 0.35 0.35 0.37 0.39 ▁▁▁▇▅
final.df %>% 
 dplyr::select_if(is.numeric) %>%
  cor(., use = "pairwise.complete.obs") %>%
  corrplot(method = "color", 
           type = "upper",
           addCoef.col = "black", # Add coefficient of correlation
           tl.col="black", tl.srt = 45, #Text label color and rotation
           # hide correlation coefficient on the principal diagonal
           diag = FALSE 
           )
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is zero

Some basic demographic info on the entire sample:

summary(final.df$age_sops)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.99   19.84   21.79   21.98   23.59   29.42
summary(final.df$age_scan)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.26   17.24   19.95   20.03   22.23   25.09
p.age_scan <- final.df %>%
  ggplot() + 
  geom_histogram(aes(x = age_scan), fill="cadetblue4") +
  ggtitle("Age at Scan")

p.age_sops <- final.df %>%
  ggplot() + 
  geom_histogram(aes(x = age_sops), fill="darkslateblue") +
  ggtitle("Age at SOPS")

age_plot <- ggarrange(p.age_scan, p.age_sops, common.legend = F, legend = "bottom", ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
annotate_figure(p=age_plot, top = text_grob("Age of Participants at Scan and Clinical Assessments", 
                               color = "black", face = "bold", size = 14))

Visualize spread of followup timepoints for possible future binning:

final.end %>%
  ggplot() + 
  geom_histogram(aes(x = sips_diff_days), fill="darkseagreen4", color="white", bins=10) +
  ggtitle("Time to Clinical Follow-Up")

#10 looks like good # of bins
final.bin_scaled <- final.end %>%
  mutate(sips_diff_bin=ntile(sips_diff_days, n=10)) %>%
  mutate_if(is.numeric, scale)

Between-Scanner Differences

Baseline Demographics and Observation Counts

demo <- final.df %>% filter(scan.time=="base.1") %>%
  group_by(scanner) %>%
  summarise(n = n(),
            age=mean(age_scan),
            age_sd=sd(age_scan))

demo.base <- atable(final.base, 
         target_cols=c("sex", "age_scan", "Race", "Ethnicity", "base_dx", "TYPE", "base_sops_p", "base_gafc", "psych.rx_life"),
         group_col="scanner",
         format_to="HTML")
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in two_sample_htest.factor(value, group, ...): Not enough values.
## Returning p-value=NaN..
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
tpt.table <- atable(final.df,
                    target_cols=c("scan.time", "age_sops", "sips_diff_yrs", "diagnosis", "sops_p", "sops_n", "sops_d", "sops_g", "GAF_C", "psych.rx_life"),
                    group_col="scanner",
                    format_to="HTML")
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties
## Warning in stats::chisq.test(group, value): Chi-squared approximation may be
## incorrect

CEST Values

Compare Thalamic CEST data by scanner:

final.df %>%dplyr::select(scanner | contains("Thalamus")) %>% ggpairs(aes(color=scanner))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cest.table <- atable(final.base,
       target_cols=c("R_Thalamus_mean", "R_Thalamus_numvoxels", "gm.R_Thalamus_mean", "gm.R_Thalamus_numvoxels"),
       group_col="scanner",
       format_to = "HTML")
## Warning in stats::ks.test(x, y, alternative = c("two.sided"), ...): cannot
## compute exact p-value with ties

Significance Testing

Are CEST values or demographics significantly different between scanners?

#check for differences in baseline diagnoses
chisq.test(final.base$scanner, final.base$base_dx, simulate.p.value=TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  final.base$scanner and final.base$base_dx
## X-squared = 2.9448, df = NA, p-value = 0.4913
#check for differences in scan age
chisq.test(final.base$scanner, final.base$age_scan, simulate.p.value=TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  final.base$scanner and final.base$age_scan
## X-squared = 21, df = NA, p-value = 1
#check for differences in baseline SOPS Positive
t.test(base_sops_p ~ scanner, data=final.base)
## 
##  Welch Two Sample t-test
## 
## data:  base_sops_p by scanner
## t = 0.24971, df = 18.875, p-value = 0.8055
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
##  -6.512717  8.276353
## sample estimates:
##   mean in group ONM mean in group Terra 
##            7.181818            6.300000
#check for differences in thalamic CEST
t.test(R_Thalamus_mean ~ scanner, data = final.base)
## 
##  Welch Two Sample t-test
## 
## data:  R_Thalamus_mean by scanner
## t = -4.0043, df = 13.984, p-value = 0.001308
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
##  -3.432339 -1.037787
## sample estimates:
##   mean in group ONM mean in group Terra 
##            6.411717            8.646781
#check for differences in thalamic volume captured
t.test(R_Thalamus_numvoxels ~ scanner, data = final.base)
## 
##  Welch Two Sample t-test
## 
## data:  R_Thalamus_numvoxels by scanner
## t = -2.2206, df = 17.739, p-value = 0.03966
## alternative hypothesis: true difference in means between group ONM and group Terra is not equal to 0
## 95 percent confidence interval:
##  -194.109892   -5.271926
## sample estimates:
##   mean in group ONM mean in group Terra 
##            271.9091            371.6000
#look for correlations between volume and CEST when controlling for scan
m.cest.vol <- lm(R_Thalamus_mean ~ scanner + R_Thalamus_numvoxels, data=final.base)
summary(m.cest.vol)
## 
## Call:
## lm(formula = R_Thalamus_mean ~ scanner + R_Thalamus_numvoxels, 
##     data = final.base)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6310 -0.5864 -0.1908  0.6115  2.4582 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.066894   0.868308   6.987 1.59e-06 ***
## scannerTerra         2.108639   0.624989   3.374  0.00338 ** 
## R_Thalamus_numvoxels 0.001268   0.002865   0.443  0.66328    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.272 on 18 degrees of freedom
## Multiple R-squared:  0.4761, Adjusted R-squared:  0.4179 
## F-statistic:  8.18 on 2 and 18 DF,  p-value: 0.002971
Anova(m.cest.vol)
## Anova Table (Type II tests)
## 
## Response: R_Thalamus_mean
##                       Sum Sq Df F value   Pr(>F)   
## scanner              18.4271  1  11.383 0.003381 **
## R_Thalamus_numvoxels  0.3172  1   0.196 0.663281   
## Residuals            29.1386 18                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Scanner is not significantly related to baseline diagnosis, baseline SOPS positive score, or age at scan but is significantly related to CEST contrast (averaged over thalamus) and number of thalamic volumes captured. After controlling for scanner, number of voxels is not significantly associated with mean CEST value in the thalamus, therefore scanner (but not volume) will be controlled for in the final model.

Clinical Values

Look at correlations between lifetime psych rx and diagnosis - unsurprisingly these are highly correlated values

kruskal.test(final.df$psych.rx_life~final.df$diagnosis)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  final.df$psych.rx_life by final.df$diagnosis
## Kruskal-Wallis chi-squared = 24.175, df = 3, p-value = 2.296e-05
ggplot(final.df) + 
  geom_count(aes(x = factor(diagnosis), y = as.factor(psych.rx_life))) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ggtitle("SOPS Total Score vs Axis 1 Dx")

# final.df %>% 
#   group_by(BBLID) %>%
#   sample_n(1) %>% #randomdplyr::selection
#   ggplot(aes(x = diagnosis, y = GAF_C)) +
#   geom_point(aes(color=psych.rx_life), show.legend = TRUE) +
#   geom_jitter(width = 0.25) +
#   theme(axis.text.x = element_text(angle = 45, hjust=1)) +
#   ggtitle("GAF vs Axis 1 Dx")

final.df %>% 
  group_by(BBLID) %>%
  ggplot(aes(x = diagnosis, y = sops_p, color=psych.rx_life)) +
  geom_point(aes(color=psych.rx_life), show.legend = TRUE) +
  geom_jitter(width = 0.25) +
  theme(axis.text.x = element_text(angle = 45, hjust=1)) +
  ggtitle("SOPS Positive Score vs Axis 1 Dx")

t.test(sops_p ~ psych.rx_life, data = final.df)
## 
##  Welch Two Sample t-test
## 
## data:  sops_p by psych.rx_life
## t = -4.0896, df = 9.0766, p-value = 0.002671
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -18.722205  -5.397453
## sample estimates:
## mean in group 0 mean in group 1 
##        4.717949       16.777778
final.df %>%
  group_by(BBLID, SCANID) %>%
  mutate(base_rx=first(psych.rx_life)) %>%
  t.test(sops_p ~ base_rx, data = .)
## 
##  Welch Two Sample t-test
## 
## data:  sops_p by base_rx
## t = -4.719, df = 7.9491, p-value = 0.00153
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -20.141533  -6.908467
## sample estimates:
## mean in group 0 mean in group 1 
##           4.725          18.250

Higher SOPS P scores are predictably related to psychosis diagnosis and psychiatric medication.

Look at CEST data by diagnosis and diagnostic category:

#plots
final.base %>% dplyr::select(base_dx | contains("Thalamus")) %>% 
  filter(base_dx != "other") %>% #too few other dx, need to remove
ggpairs(aes(color=base_dx)) +
  ggtitle("Thalamic Cest Values by Baseline Diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

final.base %>%
 dplyr::select(dx_control | contains("Thalamus")) %>% 
  mutate(dx_control=recode_factor(as.factor(dx_control), 'TRUE'="Control", 'FALSE'="Patient")) %>%
  ggpairs(aes(color=dx_control)) +
  ggtitle("Thalamic Cest Values: Patients vs Controls")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#linear model
lm.dx <- lm(R_Thalamus_mean ~ scanner + base_dx + Sex_1M + age_scan, data=final.base)
summary(lm.dx)
## 
## Call:
## lm(formula = R_Thalamus_mean ~ scanner + base_dx + Sex_1M + age_scan, 
##     data = final.base)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23978 -0.72815 -0.03797  0.51294  2.21595 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    2.6077     2.6598   0.980  0.34352   
## scannerTerra   2.4059     0.6405   3.756  0.00213 **
## base_dxother  -2.2145     1.6081  -1.377  0.19012   
## base_dxpro    -0.9952     0.7804  -1.275  0.22301   
## base_dxpsych  -0.2657     0.8451  -0.314  0.75788   
## Sex_1M         0.3039     0.6656   0.457  0.65494   
## age_scan       0.1841     0.1298   1.418  0.17804   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.279 on 14 degrees of freedom
## Multiple R-squared:  0.5885, Adjusted R-squared:  0.4121 
## F-statistic: 3.337 on 6 and 14 DF,  p-value: 0.02961
Anova(lm.dx)
## Anova Table (Type II tests)
## 
## Response: R_Thalamus_mean
##            Sum Sq Df F value   Pr(>F)   
## scanner   23.0664  1 14.1081 0.002128 **
## base_dx    4.7503  3  0.9685 0.435121   
## Sex_1M     0.3409  1  0.2085 0.654936   
## age_scan   3.2879  1  2.0110 0.178038   
## Residuals 22.8896 14                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Overall, baseline diagnosis does not appear to be significantly associated with thalamic cest when controlling for sex, age and scanner type, while scanner type does remain predictive.

Looking at baseline diagnosis vs GAF

ggplot(final.df, aes(x=sips_diff_days, y=GAF_C, color=base_dx)) +
  geom_point() +
  xlab("Time from Scan") +
  ylab("GAF") + 
  ggtitle("GAF Progression by for Each Dx Category") + 
  geom_smooth(method="lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).

sum(is.na(final.df$GAF_C))
## [1] 15
ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=base_dx)) +
  geom_point() +
  xlab("Time from Scan") +
  ylab("SOPS Positive Score") + 
  ggtitle("SOPS-P Progression by for Each Dx Category") + 
  geom_smooth(method="lm") +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

All groups appear to have relatively stable SOPS-P scores when averaged across individuals. But can CEST values predict and individual’s change?

# ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=BBLID)) +
#   geom_point() +
#   xlab("Time from Scan") +
#   ylab("SOPS Positive Score") + 
#   ggtitle("SOPS-P Progression by for Each Subject") + 
#   geom_smooth(method="lm",se = FALSE) +
#   theme_bw()

ggplot(final.df, aes(x=sips_diff_days, y=sops_p, color=BBLID)) +
  geom_line() +
  xlab("Time from Scan") +
  ylab("SOPS Positive Score") + 
  ggtitle("SOPS-P Progression by for Each Subject") + 
  geom_smooth(method="lm",se = FALSE) +
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

#plot of change by dx
final.end_scaled %>%
  ggplot() + 
  geom_histogram(aes(x = delta_sops, fill=base_dx, color=base_dx), position="dodge", alpha=0.5) +
  ggtitle("Change in SOPS Positive Score by Baseline Diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot change in sops positive by glutamate (similar to egerton et al fig 2)

final.end_scaled %>%
  filter(base_dx=="pro"| base_dx=="psych")
## # A tibble: 17 × 68
##    BBLID  date7t     scanner SCANID PROTOCOL     DOSIPS     TYPE   P1[,1] P2[,1]
##    <chr>  <date>     <chr>   <chr>  <chr>        <date>     <fct>   <dbl>  <dbl>
##  1 93274  2018-10-23 Terra   10992  833922 - Ev… 2019-11-14 Prob…  1.96    1.60 
##  2 99949  2014-08-20 ONM     9031   818964 - TI… 2015-11-05 Prob… -1.06   -0.902
##  3 94276  2018-09-07 Terra   10927  829502 - MO… 2020-02-04 Prob…  0.447   1.60 
##  4 18093  2014-06-10 ONM     8924   810336 - Go3 2015-11-08 Prob…  0.951  -0.278
##  5 139272 2014-07-22 ONM     8985   810336 - Go3 2016-01-07 Prob… -0.0559 -0.902
##  6 85369  2014-10-27 ONM     9132   810336 - TI… 2016-04-19 Prob… -1.06    0.347
##  7 99949  2014-08-20 ONM     9031   810336 - GO… 2016-03-19 Prob… -1.06   -0.278
##  8 117397 2018-02-01 Terra   10686  833922 - Ev… 2019-12-05 Prob… -0.559   0.347
##  9 91962  2018-11-30 Terra   11090  829502 - MO… 2020-10-26 Prob…  1.45    1.60 
## 10 99949  2014-08-20 ONM     9031   818964 - TI… 2016-10-19 Prob… -1.06   -0.278
## 11 132179 2018-08-15 Terra   10918  829502 - MO… 2020-12-09 Prob… -0.0559  0.347
## 12 91919  2018-01-31 Terra   10683  829502 - MO… 2020-08-12 Prob… -1.06    0.347
## 13 125073 2018-10-11 Terra   10972  833922 - Ev… 2021-05-13 Prob…  0.951   0.971
## 14 139272 2014-07-22 ONM     8985   16-013305 -… 2018-02-17 Prob… -0.0559 -0.902
## 15 120217 2014-08-19 ONM     9029   16-013305 -… 2019-02-22 Prob…  1.96    2.84 
## 16 99949  2014-08-20 ONM     9031   833922 - Ev… 2021-06-01 Prob… -0.0559 -0.278
## 17 106573 2014-05-22 ONM     8900   843329 - Lo… 2022-01-19 Prob…  1.45    0.347
## # … with 59 more variables: P3 <dbl[,1]>, P4 <dbl[,1]>, P5 <dbl[,1]>,
## #   N1 <dbl[,1]>, N2 <dbl[,1]>, N3 <dbl[,1]>, N4 <dbl[,1]>, N5 <dbl[,1]>,
## #   N6 <dbl[,1]>, D1 <dbl[,1]>, D2 <dbl[,1]>, D3 <dbl[,1]>, D4 <dbl[,1]>,
## #   G1 <dbl[,1]>, G2 <dbl[,1]>, G3 <dbl[,1]>, G4 <dbl[,1]>, GAF_C <dbl[,1]>,
## #   GAF_H <dbl[,1]>, GAF_PCTCHG <dbl[,1]>, SPD <chr>, FHPI <chr>,
## #   FIRST_DEG_SCZ <chr>, sops_p <dbl[,1]>, sops_n <dbl[,1]>, sops_d <dbl[,1]>,
## #   sops_g <dbl[,1]>, sops_tot <dbl[,1]>, sips_diff_days <dbl[,1]>, …
ggplot(data = final.end_scaled, mapping = aes(x = R_Thalamus_mean, y = delta_sops)) +
  geom_point(na.rm = T, aes(col = diagnosis)) +
  geom_smooth(method = "lm", na.rm=T, col = "black", se=T) +
  theme(legend.position = "right") +
  labs(title="Thalamic Glutamate by Change in SOPS Positive \n Among Prodrome and Psychosis Pts (Scaled)",
        x ="Thalamic Glutamate Contrast", y = "Change in SOPS Positive", color="Endpoint Diagnosis")
## `geom_smooth()` using formula 'y ~ x'

The plot is vaguely suggestive of that in Egerton et al, but clearly not significant. However, these are scaled raw values while their plot reported “Baseline thalamic glutamate level, adjusted for baseline attenuated positive symptom score”.

Modeling

Plotting simple linear model for visualization.

ggplot(data = final.df, mapping = aes(x = sips_diff_days, y = sops_p)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_smooth(method = "lm", na.rm = T, col = "black", se = F) +
  theme(legend.position = "right")
## `geom_smooth()` using formula 'y ~ x'

At the population level, SOPS Positive scores remained stable over time.

Lm to look for any interactions of thalamic CEST and diagnosis:

lm.int <- lm(sops_p ~ R_Thalamus_mean*base_dx, data=final.df)
Anova(lm.int)
## Note: model has aliased coefficients
##       sums of squares computed by model comparison
## Anova Table (Type II tests)
## 
## Response: sops_p
##                          Sum Sq Df F value    Pr(>F)    
## R_Thalamus_mean            9.96  1  0.2992    0.5874    
## base_dx                 1034.24  3 10.3536 3.353e-05 ***
## R_Thalamus_mean:base_dx    0.59  2  0.0089    0.9912    
## Residuals               1365.18 41                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm.int2 <- lm(sops_p ~ R_Thalamus_mean*R_Thalamus_numvoxels, data=final.df)
Anova(lm.int2)
## Anova Table (Type II tests)
## 
## Response: sops_p
##                                       Sum Sq Df F value Pr(>F)
## R_Thalamus_mean                        11.38  1  0.2152 0.6450
## R_Thalamus_numvoxels                   26.79  1  0.5068 0.4803
## R_Thalamus_mean:R_Thalamus_numvoxels   46.93  1  0.8876 0.3513
## Residuals                            2326.29 44
lm.int3 <- lm(sops_p ~ R_Thalamus_mean*sips_diff_days, data=final.df)
Anova(lm.int3)
## Anova Table (Type II tests)
## 
## Response: sops_p
##                                 Sum Sq Df F value Pr(>F)
## R_Thalamus_mean                  42.15  1  0.7758 0.3832
## sips_diff_days                    0.02  1  0.0003 0.9854
## R_Thalamus_mean:sips_diff_days    9.80  1  0.1803 0.6731
## Residuals                      2390.19 44

No evidence of meaningful interactions that should be included in the final model.

Mixed Effects Models of SOPS P Change

Using scaled data so model will converge more readily. No need for random slopes, since each subject’s delta_sops allows for varying clinical trajectories all on their own. Slope in this case will be analogous to the relationship between cest and improvement, which we hope will be the same/generalizable across subjects, rather than between cest and scores.

fit.delta.1 <- lmer(delta_sops ~ sips_diff_days + (1|BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ sips_diff_days + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 69.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.60842 -0.15693 -0.03706  0.22819  1.57359 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.6808   0.8251  
##  Residual             0.2272   0.4767  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error      df t value Pr(>|t|)  
## (Intercept)      0.0742     0.2048 19.1533   0.362   0.7211  
## sips_diff_days   0.3455     0.1538 15.5793   2.247   0.0395 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## sps_dff_dys 0.029
#plot
delta1_coefs <- coef(fit.delta.1)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
  rownames_to_column("BBLID")

model.end.1.data <- left_join(final.end_scaled, delta1_coefs, by="BBLID")

ggplot(data = model.end.1.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right")

Adding baseline SOPS P to model

fit.delta.2 <- lmer(delta_sops ~ sips_diff_days + base_sops_p + (1|BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ sips_diff_days + base_sops_p + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 67
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.66332 -0.26629 -0.01428  0.13924  1.50440 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.5487   0.7408  
##  Residual             0.2231   0.4723  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept)     0.09076    0.18850 18.19000   0.481   0.6359  
## sips_diff_days  0.25698    0.15349 15.11621   1.674   0.1146  
## base_sops_p    -0.40231    0.18643 20.93004  -2.158   0.0427 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) sps_d_
## sps_dff_dys  0.014       
## base_sops_p -0.045  0.289
# #plot
# delta2_coefs <- coef(fit.delta.2)$BBLID %>% 
#   rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
#   rownames_to_column("BBLID")
# 
# model.end.2.data <- left_join(final.bin_scaled, delta2_coefs, by="BBLID")
# 
# ggplot(data = model.end.2.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
#   geom_point(na.rm = T, aes(col = BBLID)) +
#   geom_abline(aes(intercept = Intercept, 
#                   slope = Slope,
#                   colour = BBLID
#                   ),
#               size = 0.5
#               ) +
#   theme(legend.position = "right")

Adding demographics

fit.delta.3 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + (1 |BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +  
##     (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 69
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.64720 -0.24782 -0.00751  0.16224  1.52153 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.6198   0.7873  
##  Residual             0.2219   0.4710  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept)     0.08304    0.20406 15.63767   0.407   0.6896  
## Sex_1M          0.10920    0.21098 17.69300   0.518   0.6112  
## age_scan        0.08623    0.22518 15.92753   0.383   0.7068  
## base_sops_p    -0.46468    0.21659 19.19893  -2.145   0.0449 *
## sips_diff_days  0.24144    0.16183 13.04497   1.492   0.1595  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sex_1M ag_scn bs_sp_
## Sex_1M       0.098                     
## age_scan    -0.242 -0.046              
## base_sops_p -0.003 -0.331 -0.277       
## sps_dff_dys -0.029 -0.246  0.089  0.303
#Anova(fit.delta.3)

# #plot
# delta3_coefs <- coef(fit.delta.3)$BBLID %>% 
#   rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
#   rownames_to_column("BBLID")
# 
# model.end.3.data <- left_join(final.bin_scaled, delta3_coefs, by="BBLID")
# 
# ggplot(data = model.end.3.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
#   geom_point(na.rm = T, aes(col = BBLID)) +
#   geom_abline(aes(intercept = Intercept, 
#                   slope = Slope,
#                   colour = BBLID
#                   ),
#               size = 0.5
#               ) +
#   theme(legend.position = "right")

Adding in CEST values along with scanner, since Thalamic glucest contrast varies significantly between scanner.

fit.delta.4 <- lmer(delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + (1 |BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan +  
##     base_sops_p + sips_diff_days + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 66.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.62261 -0.23859 -0.05075  0.07283  1.62260 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.5497   0.7414  
##  Residual             0.2430   0.4930  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                 Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept)     -0.26974    0.30711 15.35941  -0.878   0.3933  
## R_Thalamus_mean -0.08515    0.27964 14.38930  -0.304   0.7651  
## scannerTerra     0.85717    0.59174 17.87691   1.449   0.1648  
## Sex_1M           0.11436    0.20557 15.12336   0.556   0.5861  
## age_scan        -0.04455    0.24164 14.01050  -0.184   0.8564  
## base_sops_p     -0.35572    0.22190 18.48650  -1.603   0.1259  
## sips_diff_days   0.37532    0.18367  9.28487   2.043   0.0704 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_sp_
## R_Thalms_mn  0.440                                   
## scannerTerr -0.766 -0.614                            
## Sex_1M       0.072  0.083 -0.019                     
## age_scan     0.016 -0.205 -0.178 -0.072              
## base_sops_p -0.233 -0.110  0.296 -0.315 -0.324       
## sps_dff_dys -0.341 -0.114  0.414 -0.219 -0.069  0.407
#Anova(fit.delta.4)

#plot
delta4_coefs <- coef(fit.delta.4)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
  rownames_to_column("BBLID")

model.end.4.data <- left_join(final.end_scaled, delta4_coefs, by="BBLID")

ggplot(data = model.end.4.data, mapping = aes(x = sips_diff_days, y = delta_sops)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right")

Visualizing relationship with 3D plot

# # Compute the linear regression (z = ax + by + d)
# fit.3d <- lm(final.bin_scaled$delta_sops ~ final.bin_scaled$sips_diff_days + final.bin_scaled$R_Thalamus_mean)
# # predict values on regular xy grid
# grid.lines = 26
# x.pred <- seq(min(final.bin_scaled$sips_diff_days), max(final.bin_scaled$sips_diff_days), length.out = grid.lines)
# y.pred <- seq(min(final.bin_scaled$R_Thalamus_mean), max(final.bin_scaled$R_Thalamus_mean), length.out = grid.lines)
# xy <- expand.grid( x = x.pred, y = y.pred)
# z.pred <- matrix(predict(fit.3d, newdata = xy), 
#                  nrow = grid.lines, ncol = grid.lines)
# # fitted points for droplines to surface
# fitpoints <- predict(fit.3d)

scatter3D(final.end_scaled$sips_diff_days, final.end_scaled$R_Thalamus_mean, final.end_scaled$delta_sops, colvar = final.end_scaled$R_Thalamus_mean, bty = "b2", col = NULL, add = FALSE, theta = 110, phi = 20, main = "", xlab = "Time", ylab ="Thalamic CEST", zlab = "Change in SOPS P")

plotrgl()

Given the small sample size, it’s risky to add more predictors. However, we’ll try just adding the indicator for lifetime psych medication at time of scan.

fit.delta.5 <- lmer(delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 |BBLID), REML = T, data = final.end_scaled )
summary(fit.delta.5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan +  
##     base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 59.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.72014 -0.36236 -0.04593  0.30946  1.39239 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.3177   0.5636  
##  Residual             0.2395   0.4894  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                  Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)     -0.518725   0.269563 12.817737  -1.924  0.07680 . 
## R_Thalamus_mean -0.124063   0.233369 13.341957  -0.532  0.60373   
## scannerTerra     0.798900   0.504606 17.040425   1.583  0.13175   
## Sex_1M           0.186266   0.174901 14.196357   1.065  0.30467   
## age_scan         0.007742   0.201728 13.001764   0.038  0.96997   
## base_sops_p     -0.810457   0.255229 16.701783  -3.175  0.00563 **
## sips_diff_days   0.372218   0.172108 11.663537   2.163  0.05209 . 
## base_rx1         1.628971   0.591683 14.060745   2.753  0.01550 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_sp_ sps_d_
## R_Thalms_mn  0.440                                          
## scannerTerr -0.714 -0.602                                   
## Sex_1M       0.026  0.089 -0.058                            
## age_scan     0.013 -0.207 -0.198 -0.044                     
## base_sops_p  0.018 -0.050  0.291 -0.364 -0.309              
## sps_dff_dys -0.334 -0.127  0.458 -0.251 -0.089  0.385       
## base_rx1    -0.304 -0.062 -0.071  0.184  0.096 -0.669 -0.080
#Anova(fit.delta.5)

Finally, I’ll fit a full model without CEST data for comparison.

fit.delta.6 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 |BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.6)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +  
##     base_rx + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 61.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.69054 -0.30913 -0.08562  0.29935  1.43460 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.3645   0.6037  
##  Residual             0.2273   0.4768  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)     -0.2132     0.1959 13.3378  -1.089  0.29560   
## Sex_1M           0.1991     0.1798 16.0388   1.107  0.28451   
## age_scan         0.1123     0.1864 13.8319   0.603  0.55638   
## base_sops_p     -0.9478     0.2482 16.8593  -3.818  0.00139 **
## sips_diff_days   0.2307     0.1502 15.6798   1.536  0.14441   
## base_rx1         1.7329     0.6040 15.8504   2.869  0.01122 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sex_1M ag_scn bs_sp_ sps_d_
## Sex_1M      -0.021                            
## age_scan    -0.206 -0.034                     
## base_sops_p  0.341 -0.377 -0.225              
## sps_dff_dys -0.015 -0.268  0.098  0.263       
## base_rx1    -0.510  0.191  0.034 -0.674 -0.024
#Anova(fit.delta.6)

Using anova to compare nested model fits:

anova(fit.delta.5, fit.delta.6, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.end_scaled
## Models:
## fit.delta.6: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## fit.delta.5: delta_sops ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## fit.delta.6    8 68.276 78.643 -26.138   52.276                     
## fit.delta.5   10 68.285 81.243 -24.142   48.285 3.9912  2     0.1359

There is no significant difference between fit.delta.5 and fit.delta.6, suggesting that adding thalamic CEST values do not improve the model.

“The residual variance estimate can be thought of as the within groups variance and each random effect variance estimate can be thought of as a between groups estimate”

Diagnostics

#linearity - plotting residuals
plot(fit.delta.6)

#normality - plotting quantiles
qqnorm(residuals(fit.delta.6))

Heteroscedastic - model fits less well at higher (and more important) sops_p score changes; adding base_dx to try to improve fit.

fit.delta.7 <- lmer(delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + base_dx + (1|BBLID), REML = T, data = final.end_scaled)
summary(fit.delta.7)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days +  
##     base_rx + base_dx + (1 | BBLID)
##    Data: final.end_scaled
## 
## REML criterion at convergence: 55.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6842 -0.2726 -0.1425  0.2651  1.5620 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.4259   0.6526  
##  Residual             0.2139   0.4625  
## Number of obs: 27, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)   
## (Intercept)    -0.47328    0.37352 14.33122  -1.267  0.22533   
## Sex_1M          0.15545    0.20455 14.91548   0.760  0.45912   
## age_scan        0.04311    0.22122 11.93979   0.195  0.84876   
## base_sops_p    -0.95435    0.29883 14.17117  -3.194  0.00642 **
## sips_diff_days  0.27940    0.16038 12.47019   1.742  0.10607   
## base_rx1        2.28260    0.86868 10.73064   2.628  0.02394 * 
## base_dxother    0.15209    1.01056 14.34766   0.150  0.88247   
## base_dxpro      0.56908    0.49614 14.35397   1.147  0.27014   
## base_dxpsych   -0.20011    0.92859 11.49886  -0.216  0.83315   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sex_1M ag_scn bs_sp_ sps_d_ bs_rx1 bs_dxt bs_dxpr
## Sex_1M       0.264                                                  
## age_scan     0.091  0.137                                           
## base_sops_p  0.362 -0.308 -0.298                                    
## sps_dff_dys -0.284 -0.350 -0.033  0.162                             
## base_rx1    -0.076  0.166 -0.062 -0.225 -0.042                      
## base_dxothr -0.340 -0.349 -0.440  0.255  0.203 -0.032               
## base_dxpro  -0.799 -0.309 -0.266 -0.169  0.324  0.052  0.375        
## bas_dxpsych -0.569 -0.183  0.016 -0.383  0.191 -0.589  0.131  0.442
#Anova(fit.delta.7)

anova(fit.delta.6, fit.delta.7, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.end_scaled
## Models:
## fit.delta.6: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + (1 | BBLID)
## fit.delta.7: delta_sops ~ Sex_1M + age_scan + base_sops_p + sips_diff_days + base_rx + base_dx + (1 | BBLID)
##             npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## fit.delta.6    8 68.276 78.643 -26.138   52.276                     
## fit.delta.7   11 71.236 85.490 -24.618   49.236 3.0401  3     0.3855

Baseline dx does not appear to add to the fit of the model.

Progressors vs Remitters

Classifying each subject as progression vs remission based on whether trajectory from baseline to final endpoint SOPS-P is + or -, looking for between-group CEST differences (similar to Allen et al Fig 3A & binary logistic regression run in Egerton).

#first subset df to final tp for each subject
end.df <- final.df %>%
  group_by(BBLID) %>%
  slice_tail() %>%
  ungroup() %>%
  mutate(sops_worse = case_when(sops_cat == "remit" ~ 0,
                                sops_cat == "stable" ~ 0,
                                sops_cat == "progress" ~ 1))

table(end.df$sops_worse) #pretty even split
## 
##  0  1 
## 10 11
fit.stat <- glm(sops_worse ~ R_Thalamus_mean + scanner, data=end.df, family=binomial(logit))
Anova(fit.stat)
## Analysis of Deviance Table (Type II tests)
## 
## Response: sops_worse
##                 LR Chisq Df Pr(>Chisq)
## R_Thalamus_mean  0.59186  1     0.4417
## scanner          1.02959  1     0.3103
#confirm that these results don't change when removing controls
end.df.pts <- end.df %>%
  filter(base_dx!= "none")
fit.stat.pt <- glm(sops_worse ~ R_Thalamus_mean + scanner, data=end.df.pts, family=binomial(logit))
Anova(fit.stat.pt)
## Analysis of Deviance Table (Type II tests)
## 
## Response: sops_worse
##                 LR Chisq Df Pr(>Chisq)
## R_Thalamus_mean  0.88887  1     0.3458
## scanner          1.02995  1     0.3102

Thalamic glutamate levels do not differ between remitters and progressors in our sample, whether or not controls are included in the sample.

#controlling for clinical factors
fit.stat2 <- glm(sops_worse ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p, data=end.df, family=binomial(logit))
Anova(fit.stat2)
## Analysis of Deviance Table (Type II tests)
## 
## Response: sops_worse
##                 LR Chisq Df Pr(>Chisq)  
## R_Thalamus_mean  0.18528  1    0.66688  
## scanner          0.69070  1    0.40592  
## Sex_1M           0.13451  1    0.71380  
## age_scan         0.02521  1    0.87383  
## base_sops_p      2.83737  1    0.09209 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#clinical factors only
fit.stat3 <- glm(sops_worse ~ Sex_1M + age_scan + base_sops_p + base_rx + base_rx, data=end.df, family=binomial(logit))
Anova(fit.stat3)
## Analysis of Deviance Table (Type II tests)
## 
## Response: sops_worse
##             LR Chisq Df Pr(>Chisq)   
## Sex_1M        0.3231  1   0.569751   
## age_scan      0.0029  1   0.956901   
## base_sops_p   6.7927  1   0.009153 **
## base_rx       3.5063  1   0.061136 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.stat3)
## 
## Call:
## glm(formula = sops_worse ~ Sex_1M + age_scan + base_sops_p + 
##     base_rx + base_rx, family = binomial(logit), data = end.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8130  -0.6865   0.3842   0.9616   1.5516  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.06918    4.65601  -0.015    0.988  
## Sex_1M       0.64420    1.14271   0.564    0.573  
## age_scan     0.01214    0.22480   0.054    0.957  
## base_sops_p -0.28803    0.13881  -2.075    0.038 *
## base_rx1     3.77544    2.48210   1.521    0.128  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 29.065  on 20  degrees of freedom
## Residual deviance: 21.761  on 16  degrees of freedom
## AIC: 31.761
## 
## Number of Fisher Scoring iterations: 4

Running partial correlations (as in Egerton et al) on baseline glutamate vs SOPS P (“Relationships between baseline glutamate concentrations and absolute change in attenuated psychotic symptoms or GAF score over time were calculated using partial correlation coefficients, co-varying for baselines cores to control for regression to the mean”)

pcor.df <- end.df %>%
  mutate(scan.num = case_when(scanner == "Terra" ~ 0,
                                scanner == "ONM" ~ 1)) %>% #must be numeric for pcor
  dplyr::select("base_sops_p", "sops_p", "sips_diff_days", "R_Thalamus_mean", "scan.num")

pcor(pcor.df, method="pearson")
## $estimate
##                 base_sops_p     sops_p sips_diff_days R_Thalamus_mean
## base_sops_p       1.0000000  0.7087387     -0.6621693       0.2202522
## sops_p            0.7087387  1.0000000      0.3791406      -0.1104574
## sips_diff_days   -0.6621693  0.3791406      1.0000000       0.1760602
## R_Thalamus_mean   0.2202522 -0.1104574      0.1760602       1.0000000
## scan.num          0.5858204 -0.3660444      0.7327678      -0.5921479
##                   scan.num
## base_sops_p      0.5858204
## sops_p          -0.3660444
## sips_diff_days   0.7327678
## R_Thalamus_mean -0.5921479
## scan.num         1.0000000
## 
## $p.value
##                  base_sops_p       sops_p sips_diff_days R_Thalamus_mean
## base_sops_p     0.0000000000 0.0009926069   0.0027546090     0.379823145
## sops_p          0.0009926069 0.0000000000   0.1207399309     0.662599508
## sips_diff_days  0.0027546090 0.1207399309   0.0000000000     0.484661945
## R_Thalamus_mean 0.3798231454 0.6625995082   0.4846619451     0.000000000
## scan.num        0.0106297365 0.1351960432   0.0005422734     0.009621919
##                     scan.num
## base_sops_p     0.0106297365
## sops_p          0.1351960432
## sips_diff_days  0.0005422734
## R_Thalamus_mean 0.0096219190
## scan.num        0.0000000000
## 
## $statistic
##                 base_sops_p    sops_p sips_diff_days R_Thalamus_mean  scan.num
## base_sops_p       0.0000000  4.018527     -3.5346130       0.9031884  2.891366
## sops_p            4.0185273  0.000000      1.6389263      -0.4445500 -1.573374
## sips_diff_days   -3.5346130  1.638926      0.0000000       0.7154161  4.307366
## R_Thalamus_mean   0.9031884 -0.444550      0.7154161       0.0000000 -2.939322
## scan.num          2.8913662 -1.573374      4.3073659      -2.9393216  0.000000
## 
## $n
## [1] 21
## 
## $gp
## [1] 3
## 
## $method
## [1] "pearson"

No significant association of thalamic glutamate with final SOPS-P scores is found when controlling for baseline SOPS, time to endpoint, and scanner.

Appendix

References

MEM Resources

tim=rep(10:19,10)
id=rep(1:10,each=10)
rand.int=rep(rnorm(10,0,1),each=10)
rand.slop=rep(rnorm(10,0,1),each=10)
e=rnorm(100,0,0.5)

y1=10+rand.int+tim+e
y2=10+rand.int+tim+e
y3=10+rand.int+tim+rand.slop*tim+e

reg1=lmer(y1~tim+(tim|id))
summary(reg1)

reg2=lmer(y2~tim+(tim|id))
summary(reg2)

#plot
reg2_coef <- coef(reg2)$id %>% 
  rename(Intercept = `(Intercept)`, Slope = tim) %>% 
  rownames_to_column("id")
y2.data <- data.frame(y2, tim, id) %>%
  mutate(id=as.character(id))
reg2.data <- left_join(y2.data, reg2_coef, by="id")

ggplot(data = reg2.data, mapping = aes(x = tim, y = y2)) +
  geom_point(na.rm = T, aes(col = id)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = id
                  ),
              size = 0.5
              )

reg2a=lmer(y2~tim+(1|id))
summary(reg2a)

#plot
reg2a_coef <- coef(reg2a)$id %>% 
  rename(Intercept = `(Intercept)`, Slope = tim) %>% 
  rownames_to_column("id")
reg2a.data <- left_join(y2.data, reg2a_coef, by="id")

ggplot(data = reg2a.data, mapping = aes(x = tim, y = y2)) +
  geom_point(na.rm = T, aes(col = id)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = id
                  ),
              size = 0.5
              )

reg2b=lmer(y2~tim+(-1+tim|id))
summary(reg2b)

#plot
reg2b_coef <- coef(reg2b)$id %>% 
  rename(Intercept = `(Intercept)`, Slope = tim) %>% 
  rownames_to_column("id")
reg2b.data <- left_join(y2.data, reg2b_coef, by="id")

ggplot(data = reg2b.data, mapping = aes(x = tim, y = y2)) +
  geom_point(na.rm = T, aes(col = id)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = id
                  ),
              size = 0.5
              )

reg3=lmer(y3~tim+(tim|id))
summary(reg3)

Mixed Effects Models - Baseline Included

Using scaled data so model will converge more readily. Building models with predictors chosen based on a priori subject knowledge and relationships shown above, getting sequentially more complex to avoid overfitting. Building models inclusive of baseline scores will assess for relationships between thalamic cest and SOPS-P, allowing that relationship to vary linearly over time (random slope for sips_diff_days). However, because the primary interest of the study is change in SOPS P sxs, we want baseline SOPS to be a predictor of final SOPS (or change in SOPS) not predicted necessarily. Basically, i’m not hypothesizing that the relationship between sops and cest will vary linearly over time by individual (though thats’ maybe interesting?); more directly i want to know if later SOPS are related to CEST.

lmer.1 <- lmer(sops_p ~ sips_diff_days + (1|BBLID), REML = T, data = final.df_scaled)
summary(lmer.1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ sips_diff_days + (1 | BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 123.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1337 -0.4686 -0.1190  0.2692  2.3313 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  BBLID    (Intercept) 0.7948   0.8915  
##  Residual             0.3271   0.5720  
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)
## (Intercept)     0.05839    0.21220 19.15213   0.275    0.786
## sips_diff_days  0.15460    0.09762 28.91527   1.584    0.124
## 
## Correlation of Fixed Effects:
##             (Intr)
## sps_dff_dys 0.022
#plot
model_coefs <- coef(lmer.1)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
  rownames_to_column("BBLID")

model1.data <- left_join(final.df_scaled, model_coefs, by="BBLID")

ggplot(data = model1.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right")

lmer.2 <- lmer(sops_p ~ base_dx + (1 + sips_diff_yrs|BBLID), REML = T, data = final.df_scaled )
summary(lmer.2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ base_dx + (1 + sips_diff_yrs | BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 104.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3310 -0.5417 -0.1086  0.4680  2.1843 
## 
## Random effects:
##  Groups   Name          Variance Std.Dev. Corr
##  BBLID    (Intercept)   0.28689  0.5356       
##           sips_diff_yrs 0.05695  0.2386   0.28
##  Residual               0.29841  0.5463       
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##               Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)  -0.674097   0.250865 15.607479  -2.687 0.016456 *  
## base_dxother  0.009397   0.700487 14.511855   0.013 0.989479    
## base_dxpro    0.599953   0.338429 15.604001   1.773 0.095786 .  
## base_dxpsych  1.869096   0.380732 16.312411   4.909 0.000149 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) bs_dxt bs_dxpr
## base_dxothr -0.358               
## base_dxpro  -0.741  0.265        
## bas_dxpsych -0.659  0.236  0.488
#plot
model_coefs2 <- coef(lmer.2)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_yrs) %>% 
  rownames_to_column("BBLID")
model2.data <- left_join(final.df_scaled, model_coefs2, by="BBLID")

ggplot(data = model2.data, mapping = aes(x = sips_diff_yrs, y = sops_p)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right") +
  ggtitle("Model on Scaled Numeric Data")

Simple random slopes model lmer(sops_p ~ (1 + sips_diff_yrs|BBLID), REML = T, data = final.df_scaled) singular with correlation of -1, because ppts with higher intercepts have more negative slopes (i.e. worse baseline sxs -> generally more improvement). Adding base_dx term resolves this singularity (i.e. shakes up this colinearity) by (it seems) accounting for intercept values outside of the random effect of BBLID.

Adding standard demographics (sex and age at scan)

lmer.3 <- lmer(sops_p ~ Sex_1M + age_scan + base_dx + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.3)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ Sex_1M + age_scan + base_dx + (1 + sips_diff_days |  
##     BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 105.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.39393 -0.54091 -0.05436  0.51024  2.51830 
## 
## Random effects:
##  Groups   Name           Variance Std.Dev. Corr
##  BBLID    (Intercept)    0.27950  0.5287       
##           sips_diff_days 0.03395  0.1843   0.80
##  Residual                0.31824  0.5641       
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##              Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)   -0.4764     0.2766 16.6768  -1.722  0.10354   
## Sex_1M         0.1708     0.1471 15.2497   1.161  0.26348   
## age_scan       0.2612     0.1641 14.2664   1.592  0.13336   
## base_dxother  -0.7168     0.7698 15.5478  -0.931  0.36605   
## base_dxpro     0.2566     0.3731 15.6579   0.688  0.50169   
## base_dxpsych   1.5470     0.4155 15.5293   3.723  0.00194 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sex_1M ag_scn bs_dxt bs_dxpr
## Sex_1M       0.346                             
## age_scan     0.288  0.044                      
## base_dxothr -0.486 -0.264 -0.395               
## base_dxpro  -0.801 -0.312 -0.364  0.432        
## bas_dxpsych -0.742 -0.320 -0.363  0.411  0.620
#Anova(lmer.3)

#plot

model_coefs3 <- coef(lmer.3)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
  rownames_to_column("BBLID")

model3.data <- left_join(final.df_scaled, model_coefs3, by="BBLID")

ggplot(data = model3.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right") +
  ggtitle("Model on Scaled Numeric Data")

Adding in CEST values along with scanner, since Thalamic glucest contrast varies significantly between scanner.

lmer.4 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.4)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx +  
##     (1 + sips_diff_days | BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 106.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.36282 -0.50539 -0.07076  0.45835  2.41745 
## 
## Random effects:
##  Groups   Name           Variance Std.Dev. Corr
##  BBLID    (Intercept)    0.31860  0.5644       
##           sips_diff_days 0.03786  0.1946   0.77
##  Residual                0.31720  0.5632       
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##                 Estimate Std. Error      df t value Pr(>|t|)   
## (Intercept)      -0.5705     0.3112 13.7974  -1.833  0.08843 . 
## R_Thalamus_mean  -0.1447     0.2238 12.3298  -0.646  0.52998   
## scannerTerra      0.3513     0.4600 12.3547   0.764  0.45933   
## Sex_1M            0.2033     0.1611 12.7517   1.262  0.22970   
## age_scan          0.2906     0.1901 12.6545   1.529  0.15090   
## base_dxother     -1.0043     0.8835 12.7599  -1.137  0.27653   
## base_dxpro        0.1290     0.4249 12.7810   0.304  0.76636   
## base_dxpsych      1.5190     0.4357 12.8304   3.487  0.00409 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_dxt bs_dxpr
## R_Thalms_mn  0.212                                           
## scannerTerr -0.373 -0.713                                    
## Sex_1M       0.186 -0.147  0.295                             
## age_scan     0.226 -0.379  0.121  0.035                      
## base_dxothr -0.260  0.356 -0.411 -0.341 -0.410               
## base_dxpro  -0.534  0.338 -0.397 -0.380 -0.381  0.528        
## bas_dxpsych -0.640  0.093 -0.112 -0.332 -0.345  0.418  0.608
Anova(lmer.4)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: sops_p
##                   Chisq Df Pr(>Chisq)    
## R_Thalamus_mean  0.4176  1     0.5181    
## scanner          0.5834  1     0.4450    
## Sex_1M           1.5915  1     0.2071    
## age_scan         2.3375  1     0.1263    
## base_dx         21.9961  3  6.535e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot
model_coefs4 <- coef(lmer.4)$BBLID %>% 
  rename(Intercept = `(Intercept)`, Slope = sips_diff_days) %>% 
  rownames_to_column("BBLID")
model4.data <- left_join(final.df_scaled, model_coefs4, by="BBLID")
ggplot(data = model4.data, mapping = aes(x = sips_diff_days, y = sops_p)) +
  geom_point(na.rm = T, aes(col = BBLID)) +
  geom_abline(aes(intercept = Intercept, 
                  slope = Slope,
                  colour = BBLID
                  ),
              size = 0.5
              ) +
  theme(legend.position = "right") +
  ggtitle("Model on Scaled Numeric Data")

Given the small sample size, it’s risky to add more predictors. However, we’ll try just adding the lifetime psych medication indicator.

lmer.5 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.5)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx +  
##     psych.rx_life + (1 + sips_diff_days | BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 101.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.25643 -0.53706 -0.07254  0.55213  2.08859 
## 
## Random effects:
##  Groups   Name           Variance Std.Dev. Corr
##  BBLID    (Intercept)    0.17781  0.4217       
##           sips_diff_days 0.04767  0.2183   0.26
##  Residual                0.31214  0.5587       
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##                 Estimate Std. Error       df t value Pr(>|t|)  
## (Intercept)     -0.49089    0.26400 11.29953  -1.859   0.0892 .
## R_Thalamus_mean -0.07546    0.19730 11.59945  -0.382   0.7090  
## scannerTerra     0.15601    0.40819 11.73509   0.382   0.7091  
## Sex_1M           0.21152    0.14110 12.14876   1.499   0.1594  
## age_scan         0.20002    0.16522 11.18974   1.211   0.2510  
## base_dxother    -1.25001    0.78355 11.55364  -1.595   0.1376  
## base_dxpro       0.23577    0.36901 11.26722   0.639   0.5357  
## base_dxpsych     0.58626    0.53476 14.20238   1.096   0.2912  
## psych.rx_life1   1.18544    0.50345 21.21613   2.355   0.0282 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) R_Thl_ scnnrT Sex_1M ag_scn bs_dxt bs_dxpr bs_dxps
## R_Thalms_mn  0.212                                                   
## scannerTerr -0.364 -0.708                                            
## Sex_1M       0.198 -0.115  0.270                                     
## age_scan     0.237 -0.369  0.117  0.033                              
## base_dxothr -0.271  0.316 -0.364 -0.340 -0.369                       
## base_dxpro  -0.522  0.340 -0.415 -0.379 -0.371  0.495                
## bas_dxpsych -0.493 -0.012  0.043 -0.265 -0.159  0.431  0.360         
## psych.rx_l1  0.062  0.107 -0.174  0.038 -0.114 -0.207  0.082  -0.704
Anova(lmer.5)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: sops_p
##                  Chisq Df Pr(>Chisq)  
## R_Thalamus_mean 0.1463  1    0.70211  
## scanner         0.1461  1    0.70230  
## Sex_1M          2.2472  1    0.13386  
## age_scan        1.4655  1    0.22606  
## base_dx         8.1330  3    0.04334 *
## psych.rx_life   5.5444  1    0.01854 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even though psych.rx_life is very predictive of sops_p at a given timepoint, is not so much a predictor of interest (since if you know whether someone is taking psychiatric meds you probably also know their positive sxs); it’s included because we want to control for it when testing the significance of the other predictors (because meds may affect glucest contrast and/or sops positive score).

Switching out baseline sops positive (base_sops_p) for base_dx results in a singular fit.

lmer.6 <- lmer(sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_sops_p + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
## boundary (singular) fit: see ?isSingular
# summary(lmer.6)
# Anova(lmer.6)

Finally, I’ll fit a full model without CEST data for comparison.

lmer.7 <- lmer(sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days|BBLID), REML = T, data = final.df_scaled )
summary(lmer.7)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days |  
##     BBLID)
##    Data: final.df_scaled
## 
## REML criterion at convergence: 99.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.24856 -0.56892 -0.06649  0.58064  2.15616 
## 
## Random effects:
##  Groups   Name           Variance Std.Dev. Corr
##  BBLID    (Intercept)    0.1327   0.3643       
##           sips_diff_days 0.0442   0.2102   0.23
##  Residual                0.3153   0.5616       
## Number of obs: 48, groups:  BBLID, 21
## 
## Fixed effects:
##                Estimate Std. Error      df t value Pr(>|t|)  
## (Intercept)     -0.4490     0.2287 13.4203  -1.963   0.0707 .
## Sex_1M           0.1993     0.1263 14.6326   1.578   0.1359  
## age_scan         0.1797     0.1391 12.3103   1.292   0.2200  
## base_dxother    -1.1620     0.6818 14.1460  -1.704   0.1102  
## base_dxpro       0.2917     0.3116 12.9301   0.936   0.3663  
## base_dxpsych     0.5403     0.5003 15.2687   1.080   0.2969  
## psych.rx_life1   1.2589     0.4699 21.3073   2.679   0.0139 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) Sex_1M ag_scn bs_dxt bs_dxpr bs_dxps
## Sex_1M       0.342                                     
## age_scan     0.305  0.057                              
## base_dxothr -0.460 -0.285 -0.344                       
## base_dxpro  -0.794 -0.320 -0.354  0.399                
## bas_dxpsych -0.510 -0.295 -0.166  0.486  0.414         
## psych.rx_l1 -0.003  0.095 -0.118 -0.306  0.011  -0.710
Anova(lmer.7)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: sops_p
##                Chisq Df Pr(>Chisq)   
## Sex_1M        2.4904  1   0.114543   
## age_scan      1.6696  1   0.196316   
## base_dx       9.1369  3   0.027525 * 
## psych.rx_life 7.1766  1   0.007386 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Using anova to compare nested model fits:

anova(lmer.4, lmer.5, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.df_scaled
## Models:
## lmer.4: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + (1 + sips_diff_days | BBLID)
## lmer.5: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
##        npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)   
## lmer.4   12 121.10 143.55 -48.549   97.098                        
## lmer.5   13 116.02 140.35 -45.011   90.022 7.0754  1   0.007815 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lmer.5, lmer.7, test="Chisq")
## refitting model(s) with ML (instead of REML)
## Data: final.df_scaled
## Models:
## lmer.7: sops_p ~ Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
## lmer.5: sops_p ~ R_Thalamus_mean + scanner + Sex_1M + age_scan + base_dx + psych.rx_life + (1 + sips_diff_days | BBLID)
##        npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## lmer.7   11 112.44 133.02 -45.219   90.437                     
## lmer.5   13 116.02 140.35 -45.011   90.022 0.4151  2     0.8126

lmer.5 (adding psych.rx_life) unsurprisingly significantly improved the fit significantly. There was no significant difference between lmer.5 and lmer.7, suggesting that adding thalamic CEST values do not improve the model.

“The residual variance estimate can be thought of as the within groups variance and each random effect variance estimate can be thought of as a between groups estimate”

Diagnostics

#linearity - plotting residuals
plot(lmer.5)

#normality - plotting quantiles
qqnorm(residuals(lmer.5))

Heteroscedastic - model fits less well at higher (and more important) sops_p scores

Calculating % Change

Removed because it resulted in irritating and uninterpretable inf values.

sops_change <- sops_change %>% 
  group_by(bblid_scan) %>%
  arrange(scan.time) %>%
  mutate(pct.delta.sops_p = abs.delta.sops_p/first(sops_p)  * 100,
         pct.delta.sops_n = abs.delta.sops_n/first(sops_n)  * 100,
         pct.delta.sops_d = abs.delta.sops_d/first(sops_d)  * 100,
         pct.delta.sops_g = abs.delta.sops_g/first(sops_g)  * 100,
         pct.delta.sops_tot = abs.delta.sops_tot/first(sops_tot)  * 100,
         pct.delta.gacf = abs.delta.gafc/first(GAF_C)  * 100)

Unused Med Dataframes

Merge and keep med info at time of scan as well as time of each SOPS endpoint assessment.

sops_change_meds <- left_join(sops_change_dx, medswide, by="BBLID")

#calculate datediff from meds to scan/SIPS
sops_change_meds <- sops_change_meds %>%
  mutate(meds_7t_diff = as.numeric(difftime(DOCOLLECT, date7t, units = "days")),
         meds_sops_diff = as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days")))
#ok, now pull the baseline meds and set them aside for now
meds_baseline <- sops_change_meds %>%
  arrange(abs(meds_7t_diff)) %>%
  group_by(observation) %>%
  slice_head() %>% #keep only closest dx
  ungroup() %>%
 dplyr::select(observation, meds_7t_diff, DOCOLLECT | contains("MEDICINE_") | contains("NOTES_") | contains("IS_CURRENT_")) %>% #drop non-med stuff
 dplyr::select_if(~!all(is.na(.))) %>% #drop any col that's totally empty
  rename_with(.cols = 3:ncol(.), function(x){paste0("base.", x)}) #add prefix
#go back and filter down to endpoint meds
meds_end <- sops_change_meds %>%
  arrange(abs(meds_sops_diff)) %>%
  group_by(observation) %>%
  slice_head() %>% #keep only closest dx
  ungroup() %>%
 dplyr::select(observation, meds_sops_diff, DOCOLLECT | contains("MEDICINE_") | contains("NOTES_") | contains("IS_CURRENT_")) %>% #drop non-med stuff
 dplyr::select_if(~!all(is.na(.)))

#plots to see the range of closest med info
sops.med.p <- meds_end %>%
  ggplot() + 
    geom_histogram(aes(x = meds_sops_diff), position="identity", color="lightblue", fill="lightblue")
scan.med.p <- meds_baseline %>%
  ggplot() + 
    geom_histogram(aes(x = meds_7t_diff), position="identity", color="lightgreen", fill="lightgreen")
# put p1 and p2 together
ggarrange(sops.med.p, scan.med.p, common.legend = T, legend = "bottom", ncol = 2)

Ok, maybe let’s restrict to +- 100 days and add back in. Also tried 200 days but didn’t add anything

#filter down
meds_baseline <- meds_baseline %>%
  filter(abs(meds_7t_diff) <= 100) %>%
  mutate(across(c(contains("MEDICINE_")), factor)) #also setting meds as factors to see frequency later
meds_end <- meds_end %>%
  filter(abs(meds_sops_diff) <= 100) %>%
  mutate(across(c(contains("MEDICINE_")), factor)) #also setting meds as factors to see frequency later


#merge base and end meds together
meds_filt <- full_join(meds_baseline, meds_end, by="observation")
sum(is.na(meds_filt$meds_7t_diff))
sum(is.na(meds_filt$meds_sops_diff))

#merging meds with sops_change_dx b/c sops_change_meds already has med stuff
sops_change_meds.final <- left_join(sops_change_dx, meds_filt, by="observation") 

#make sure no obs lost
n_unique(sops_change_meds.final$observation)
#skim(sops_change_meds.final)

Ok, now how to summarize all this medication info into one cohesive value

#see how often each med is reported
sops_change_meds.final %>%
 dplyr::select(contains("MEDICINE")) %>%
  unlist() %>%
  table()

NOTES: CARVEDILOL = beta blocker (anxiety?) CLONIDINE = hypertension, ADHD? DEXMETHYLPHENIDATE HYDROCHLORIDE = ADHD NORETHINDRONE ACETATE = progestin RANITIDINE = antihistamine/antacid TRILOMARZIA = birth control ALBUTEROL = asthma DEXEDRINE = ADHD DICLOFENAC = arthritis ADVAIR = asthma

ideas - use 0,1 indicator for each class of medication at each timepoint? could be granular (asthma, allergies, stimulants) or more broad (psych_rx, other_rx) - use that indicator but lifetime?

keep if start/end dates of med are < date > ? lifetime indicator - dont filter by date assessed, just if date-rx-start is < endpoint date?

Test using the DOMED_START and DOMED_END to only capture meds that are explicitly current for 7T scan

#merge
scan_date_med <- inner_join(scan_dates, meds_reduced, by="BBLID")

#ok, keep only rows where med is current to 7T
scan_date_med_current <- scan_date_med %>%
  filter(is.na(DOMED_START) | date7t > DOMED_START) %>% #keep if med is STARTED BEFORE 7t
  filter(is.na(DOMED_START) | date7t < DOMED_END) %>% #keep if med was ENDED AFTER 7t
  mutate(med_7t_diff = as.numeric(difftime(DOCOLLECT,date7t, units = "days"))) %>%
  filter(!(is.na(DOMED_START) & (abs(med_7t_diff) > 100))) %>%  #only keep NAs if DOCOLLECT is w/in 100 days of scan
  group_by(observation) %>%
  mutate(count = row_number()) %>% #add counts so i can pivot after
  ungroup() %>%
  rename_with(.cols = 4:ncol(.), function(x){paste0("base.", x)}) #add prefix
scan_date_med_current$base.NOTES[is.na(scan_date_med_current$base.NOTES)] <- "empty entry"
skim(scan_date_med_current)

#pivot wide
scan_meds_current_wide <- pivot_wider(scan_date_med_current, id_cols=observation, names_from = base.count, values_from=c(4:12))
#just keep essential cols
scan.meds.current_wide.filt <- scan_meds_current_wide %>%
 dplyr::select(observation | contains("MEDICINE_") | contains("NOTES_")) %>%
 dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty

In the meantime I’ll try to find meds current to endpoint SOPs.

#ok, keep only rows where med is current to 7T
end_date_med_current <- end_date_med %>%
  filter(is.na(DOMED_START) | DOSIPS > DOMED_START) %>% #keep if med is STARTED BEFORE SOPS end
  filter(is.na(DOMED_START) | DOSIPS < DOMED_END) %>% #keep if med was ENDED AFTER SOPS end
  mutate(med_sops_diff = as.numeric(difftime(DOCOLLECT, DOSIPS, units = "days"))) %>%
  filter(!(is.na(DOMED_START) & (abs(med_sops_diff) > 100))) %>%  #only keep NAs if DOCOLLECT is w/in 100 days of scan
  group_by(observation) %>%
  mutate(count = row_number()) %>% #add counts so i can pivot after
  ungroup()
end_date_med_current$NOTES[is.na(end_date_med_current$NOTES)] <- "empty entry"
skim(end_date_med_current)

#pivot wide
end_meds_current_wide <- pivot_wider(end_date_med_current, id_cols=observation, names_from = count, values_from=c(4:12))
#just keep essential cols
end.meds.current_wide.filt <- end_meds_current_wide %>%
 dplyr::select(observation | contains("MEDICINE_") | contains("NOTES_")) %>%
 dplyr::select_if(~!all(is.na(.))) #drop any col that's totally empty

This seems to be working, lets try merging baseline and endpoint meds and see what we have

currentmeds <- right_join(scan.meds.current_wide.filt, end.meds.current_wide.filt, by="observation")

Actually very narrow range of meds here, but the empty entry notes indicate either no medications or no new medications. Therefore I’ll need to look back at participant’s former meds

#Wide Data

Operationalizing Change

Ok, now we calculate change from baseline to each endpoint (absolute and slope). Change is defined as final score - baseline score, so a negative value means final<baseline which means sxs improved! (except GAFC, where positive -> sxs improved)

#calculate change
sops_change <- scanlist_clean %>% 
  group_by(bblid_scan) %>%
  arrange(scan.time) %>%
  mutate(abs.delta.sops_p = sops_p - first(sops_p), #change=final-baseline -> negative value means score improves!
         abs.delta.sops_n = sops_n - first(sops_n),
         abs.delta.sops_d = sops_d - first(sops_d),
         abs.delta.sops_g = sops_g - first(sops_g),
         abs.delta.sops_tot = sops_tot - first(sops_tot),
         abs.delta.gafc = GAF_C - first(GAF_C),
         delta.sops_time = as.numeric(difftime(DOSIPS,first(DOSIPS), units = "days")),
         slope.sops_p = abs.delta.sops_p/delta.sops_time,
         slope.sops_n = abs.delta.sops_n/delta.sops_time,
         slope.sops_d = abs.delta.sops_d/delta.sops_time,
         slope.sops_g = abs.delta.sops_g/delta.sops_time,
         slope.sops_tot = abs.delta.sops_tot/delta.sops_time,
         slope.gafc = abs.delta.gafc/delta.sops_time,
         observation = paste(bblid_scan, scan.time)) %>% #add unique ID for each BBLID-scan-endpoint#
  ungroup() %>%
 dplyr::select(!c(sops.item.list, sops_p, sops_n, sops_d, sops_g, sops_tot)) %>% #drop raw scores
  filter(!(scan.time=="base.1"))

#double check to make sure no endpoint scans come before baseline scans 
summary(sops_change$delta.sops_time)
#just one row per BBLID-scan-endpoint#
any(duplicated(sops_change$observation))
n_unique(sops_change$observation)

#fun plots
ggplot(sops_change, aes(x=slope.sops_tot)) + geom_histogram()
ggplot(sops_change, aes(x=slope.sops_n)) + geom_histogram()
ggplot(sops_change, aes(x=slope.gafc)) + geom_histogram()

scanlist_clean %>%
ggplot(aes(x = sips_diff_days, y = sops_tot, color = bblid_scan)) + 
    geom_line() + 
    theme_bw()

linear model for fun

bs.fit <- lm(slope.sops_tot ~ PRODROMAL, data=sops_change)
Anova(bs.fit)